Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: add fma intrinsic for amd64
author: smasher164 <aindurti@gmail.com>
Tue, 25 Sep 2018 07:10:33 +0000 (03:10 -0400)
committer: Keith Randall <khr@golang.org>
Mon, 21 Oct 2019 16:42:10 +0000 (16:42 +0000)
To permit ssa-level optimization, this change introduces an amd64 intrinsic
that generates the VFMADD231SD instruction for the fused-multiply-add
operation on systems that support it. System support is detected via
cpu.X86.HasFMA. A rewrite rule can then translate the generic ssa intrinsic
("Fma") to VFMADD231SD.

The benchmark compares the software implementation (old) with the intrinsic
(new).

name   old time/op  new time/op  delta
Fma-4  27.2ns ± 1%   1.0ns ± 9%  -96.48%  (p=0.008 n=5+5)

Updates #25819.

Change-Id: I966655e5f96817a5d06dff5942418a3915b09584
Reviewed-on: https://go-review.googlesource.com/c/go/+/137156
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
12 files changed:
src/cmd/compile/internal/amd64/ssa.go
src/cmd/compile/internal/gc/builtin.go
src/cmd/compile/internal/gc/builtin/runtime.go
src/cmd/compile/internal/gc/go.go
src/cmd/compile/internal/gc/ssa.go
src/cmd/compile/internal/ssa/gen/AMD64.rules
src/cmd/compile/internal/ssa/gen/AMD64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
src/runtime/cpuflags.go
src/runtime/proc.go
test/codegen/math.go

index a82ed0995cd5cee98f6ad64daf627e72216de958..480ff6523ac969d77302ff3f3478a901034a4d5a 100644 (file)
@@ -164,6 +164,14 @@ func duff(size int64) (int64, int64) {
 
 func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
        switch v.Op {
+       case ssa.OpAMD64VFMADD231SD:
+               p := s.Prog(v.Op.Asm())
+               p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()}
+               p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
+               p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[1].Reg()})
+               if v.Reg() != v.Args[0].Reg() {
+                       v.Fatalf("input[0] and output not in same register %s", v.LongString())
+               }
        case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
                r := v.Reg()
                r1 := v.Args[0].Reg()
index a770356ea0472bd35048e547706565d825fed35c..c7cd5fae64f0eb95948295c0bc8277a40be4a98a 100644 (file)
@@ -185,6 +185,7 @@ var runtimeDecls = [...]struct {
        {"checkptrArithmetic", funcTag, 122},
        {"x86HasPOPCNT", varTag, 15},
        {"x86HasSSE41", varTag, 15},
+       {"x86HasFMA", varTag, 15},
        {"arm64HasATOMICS", varTag, 15},
 }
 
index 3e9055b2accfd32a556cb523a10b7ec63c6c01c2..d9eaa0b7e5701125c9171df8e58ab258c0f64f1a 100644 (file)
@@ -241,4 +241,5 @@ func checkptrArithmetic(unsafe.Pointer, []unsafe.Pointer)
 // architecture variants
 var x86HasPOPCNT bool
 var x86HasSSE41 bool
+var x86HasFMA bool
 var arm64HasATOMICS bool
index f6ad3752a0ad1da5c52a1ba0bb2222c1f2da3613..bd10ca047fcb7ca2260f302a7420575eb8554175 100644 (file)
@@ -311,6 +311,7 @@ var (
        racewriterange,
        x86HasPOPCNT,
        x86HasSSE41,
+       x86HasFMA,
        arm64HasATOMICS,
        typedmemclr,
        typedmemmove,
index 0b76ad728cdb3b1f923bf90d3fe14a30cf4f320a..b65882e032b047cc2e358e8f0b8b9b10376083a9 100644 (file)
@@ -91,6 +91,7 @@ func initssaconfig() {
        racewriterange = sysfunc("racewriterange")
        x86HasPOPCNT = sysvar("x86HasPOPCNT")       // bool
        x86HasSSE41 = sysvar("x86HasSSE41")         // bool
+       x86HasFMA = sysvar("x86HasFMA")             // bool
        arm64HasATOMICS = sysvar("arm64HasATOMICS") // bool
        typedmemclr = sysfunc("typedmemclr")
        typedmemmove = sysfunc("typedmemmove")
@@ -3326,6 +3327,36 @@ func init() {
                        return s.newValue3(ssa.OpFma, types.Types[TFLOAT64], args[0], args[1], args[2])
                },
                sys.ARM64, sys.PPC64, sys.S390X)
+       addF("math", "Fma",
+               func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+                       addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), x86HasFMA, s.sb)
+                       v := s.load(types.Types[TBOOL], addr)
+                       b := s.endBlock()
+                       b.Kind = ssa.BlockIf
+                       b.SetControl(v)
+                       bTrue := s.f.NewBlock(ssa.BlockPlain)
+                       bFalse := s.f.NewBlock(ssa.BlockPlain)
+                       bEnd := s.f.NewBlock(ssa.BlockPlain)
+                       b.AddEdgeTo(bTrue)
+                       b.AddEdgeTo(bFalse)
+                       b.Likely = ssa.BranchLikely // >= haswell cpus are common
+
+                       // We have the intrinsic - use it directly.
+                       s.startBlock(bTrue)
+                       s.vars[n] = s.newValue3(ssa.OpFma, types.Types[TFLOAT64], args[0], args[1], args[2])
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Call the pure Go version.
+                       s.startBlock(bFalse)
+                       a := s.call(n, callNormal)
+                       s.vars[n] = s.load(types.Types[TFLOAT64], a)
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Merge results.
+                       s.startBlock(bEnd)
+                       return s.variable(n, types.Types[TFLOAT64])
+               },
+               sys.AMD64)
 
        makeRoundAMD64 := func(op ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
                return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
index 8ef51f95659b64aaee5cd80c4a4bfb728a0c24f6..44c9e030d42a98dfd107bc3da8a42c0cec910ed5 100644 (file)
 (Floor x)      -> (ROUNDSD [1] x)
 (Ceil x)       -> (ROUNDSD [2] x)
 (Trunc x)      -> (ROUNDSD [3] x)
+(Fma x y z) -> (VFMADD231SD z x y)
 
 // Lowering extension
 // Note: we always extend to 64 bits even though some ops don't need that many result bits.
index 3fa5cfbb963d9d5f6ee47842e06a5065f3bd51ab..5924fa497ac08b8ac9e5528cd348b16f5c5875c5 100644 (file)
@@ -147,6 +147,7 @@ func init() {
 
                fp01     = regInfo{inputs: nil, outputs: fponly}
                fp21     = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+               fp31     = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly}
                fp21load = regInfo{inputs: []regMask{fp, gpspsb, 0}, outputs: fponly}
                fpgp     = regInfo{inputs: fponly, outputs: gponly}
                gpfp     = regInfo{inputs: gponly, outputs: fponly}
@@ -478,6 +479,10 @@ func init() {
                // Any use must be preceded by a successful check of runtime.x86HasSSE41.
                {name: "ROUNDSD", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSD"}, // rounds arg0 depending on auxint, 1 means math.Floor, 2 Ceil, 3 Trunc
 
+               // VFMADD231SD only exists on platforms with the FMA3 instruction set.
+               // Any use must be preceded by a successful check of runtime.x86HasFMA.
+               {name: "VFMADD231SD", argLength: 3, reg: fp31, resultInArg0: true, asm: "VFMADD231SD"},
+
                {name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
                {name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
                // Note: SBBW and SBBB are subsumed by SBBL
index 7f9fb4e3ef6b212c851c5eab922e7053f46763c8..59740da9a4c520a29579b72b4967b3d9f92b18cb 100644 (file)
@@ -743,6 +743,7 @@ const (
        OpAMD64POPCNTL
        OpAMD64SQRTSD
        OpAMD64ROUNDSD
+       OpAMD64VFMADD231SD
        OpAMD64SBBQcarrymask
        OpAMD64SBBLcarrymask
        OpAMD64SETEQ
@@ -9625,6 +9626,22 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VFMADD231SD",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVFMADD231SD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+                               {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+                               {2, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+                       },
+                       outputs: []outputInfo{
+                               {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+                       },
+               },
+       },
        {
                name:   "SBBQcarrymask",
                argLen: 1,
index 386086f4b02b7b289a268735a9a360091a33aafc..845e581364edd90294bf6c32e9e4bb8e40399cc7 100644 (file)
@@ -768,6 +768,8 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpEqPtr_0(v)
        case OpFloor:
                return rewriteValueAMD64_OpFloor_0(v)
+       case OpFma:
+               return rewriteValueAMD64_OpFma_0(v)
        case OpGeq16:
                return rewriteValueAMD64_OpGeq16_0(v)
        case OpGeq16U:
@@ -52331,6 +52333,21 @@ func rewriteValueAMD64_OpFloor_0(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpFma_0(v *Value) bool {
+       // match: (Fma x y z)
+       // cond:
+       // result: (VFMADD231SD z x y)
+       for {
+               z := v.Args[2]
+               x := v.Args[0]
+               y := v.Args[1]
+               v.reset(OpAMD64VFMADD231SD)
+               v.AddArg(z)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+}
 func rewriteValueAMD64_OpGeq16_0(v *Value) bool {
        b := v.Block
        // match: (Geq16 x y)
index 1565afb93a5573d37bb4b7d3456514a7519d9dbd..3e859a3516180c3e9afd50629e30c6f3842919bb 100644 (file)
@@ -23,6 +23,7 @@ var (
        // TODO: deprecate these; use internal/cpu directly.
        x86HasPOPCNT bool
        x86HasSSE41  bool
+       x86HasFMA    bool
 
        arm64HasATOMICS bool
 )
index d7f55b6c647e2e279542736b5b1816355afbb52e..c419dee7710846109b0560b367d555e882627d41 100644 (file)
@@ -514,6 +514,7 @@ func cpuinit() {
        // to guard execution of instructions that can not be assumed to be always supported.
        x86HasPOPCNT = cpu.X86.HasPOPCNT
        x86HasSSE41 = cpu.X86.HasSSE41
+       x86HasFMA = cpu.X86.HasFMA
 
        arm64HasATOMICS = cpu.ARM64.HasATOMICS
 }
index 427f305c122366caa53cfd3675326f9d76f4fc14..c942085480598d9e6da9cc0b99ae5bdc1e88cca6 100644 (file)
@@ -108,6 +108,7 @@ func copysign(a, b, c float64) {
 }
 
 func fma(x, y, z float64) float64 {
+       // amd64:"VFMADD231SD"
        // arm64:"FMADDD"
        // s390x:"FMADD"
        // ppc64:"FMADD"