Cypherpunks repositories - gostls13.git/commitdiff
cmd/internal/gc: use hardware instruction for math.Sqrt (amd64/arm)
author: Russ Cox <rsc@golang.org>
Wed, 1 Apr 2015 20:02:34 +0000 (16:02 -0400)
committer: Russ Cox <rsc@golang.org>
Fri, 3 Apr 2015 16:13:36 +0000 (16:13 +0000)
I first prototyped this change in Sept 2011, and I discarded it
because it made no difference in the obvious benchmark loop.
It still makes no difference in the obvious benchmark loop,
but in a less obvious one, doing some extra computation
around the calls to Sqrt, not making the call does have a
significant effect.

benchmark                 old ns/op     new ns/op     delta
BenchmarkSqrt             4.56          4.57          +0.22%
BenchmarkSqrtIndirect     4.56          4.56          +0.00%
BenchmarkSqrtGo           69.4          69.4          +0.00%
BenchmarkSqrtPrime        4417          3647          -17.43%

This is a warmup for using hardware expansions for some
calls to 1-line assembly routines in the runtime (for example getg).

Change-Id: Ie66be23f8c09d0f7dc4ddd7ca8a93cfce28f55a4
Reviewed-on: https://go-review.googlesource.com/8356
Reviewed-by: Rob Pike <r@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
src/cmd/5g/gsubr.go
src/cmd/5g/peep.go
src/cmd/5g/prog.go
src/cmd/6g/gsubr.go
src/cmd/6g/prog.go
src/cmd/internal/gc/cgen.go
src/cmd/internal/gc/gen.go
src/cmd/internal/gc/syntax.go
src/cmd/internal/gc/walk.go
src/math/all_test.go
src/math/sqrt.go

index 0d22f74c9a1b625c186bfd1efcea7a0300ecba2e..fe4ed8d1f212567de91f6a51dd126068558f8584 100644 (file)
@@ -1055,6 +1055,9 @@ func optoas(op int, t *gc.Type) int {
 
        case gc.ODIV<<16 | gc.TFLOAT64:
                a = arm.ADIVD
+
+       case gc.OSQRT<<16 | gc.TFLOAT64:
+               a = arm.ASQRTD
        }
 
        return a
index 5305e4b7f6bb2f3d15ba26f9b8de064a4e83287e..9ec3be2eecc022f2a71a57c8485f94d7420370f0 100644 (file)
@@ -1101,6 +1101,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {
                return 0
 
        case obj.ANOP, /* read,, write */
+               arm.ASQRTD,
                arm.AMOVW,
                arm.AMOVF,
                arm.AMOVD,
index bfb703e56fa0a2c39d347601fed8cf1d4437f7fe..c472cdf04282a01cd7636bf3e115ae566e94ba6f 100644 (file)
@@ -70,16 +70,17 @@ var progtable = [arm.ALAST]obj.ProgInfo{
        arm.ATST:    {gc.SizeL | gc.LeftRead | gc.RightRead, 0, 0, 0},
 
        // Floating point.
-       arm.AADDD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-       arm.AADDF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-       arm.ACMPD: {gc.SizeD | gc.LeftRead | gc.RightRead, 0, 0, 0},
-       arm.ACMPF: {gc.SizeF | gc.LeftRead | gc.RightRead, 0, 0, 0},
-       arm.ADIVD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-       arm.ADIVF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-       arm.AMULD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-       arm.AMULF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-       arm.ASUBD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-       arm.ASUBF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+       arm.AADDD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+       arm.AADDF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+       arm.ACMPD:  {gc.SizeD | gc.LeftRead | gc.RightRead, 0, 0, 0},
+       arm.ACMPF:  {gc.SizeF | gc.LeftRead | gc.RightRead, 0, 0, 0},
+       arm.ADIVD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+       arm.ADIVF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+       arm.AMULD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+       arm.AMULF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+       arm.ASUBD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+       arm.ASUBF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+       arm.ASQRTD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
 
        // Conversions.
        arm.AMOVWD: {gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Conv, 0, 0, 0},
index b2290af7339c8f36a72b5732d4f8cd004963296e..323ea69a989479fec0ae195f495ea6bb634f1320 100644 (file)
@@ -1131,6 +1131,9 @@ func optoas(op int, t *gc.Type) int {
 
        case gc.ODIV<<16 | gc.TFLOAT64:
                a = x86.ADIVSD
+
+       case gc.OSQRT<<16 | gc.TFLOAT64:
+               a = x86.ASQRTSD
        }
 
        return a
index 0644800257374fe2b38e1a3b709b441c3071cadd..fe9f0138518429a6bd2980d83650368af83798dc 100644 (file)
@@ -204,6 +204,7 @@ var progtable = [x86.ALAST]obj.ProgInfo{
        x86.ASHRL:     {gc.SizeL | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
        x86.ASHRQ:     {gc.SizeQ | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
        x86.ASHRW:     {gc.SizeW | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
+       x86.ASQRTSD:   {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
        x86.ASTOSB:    {gc.OK, AX | DI, DI, 0},
        x86.ASTOSL:    {gc.OK, AX | DI, DI, 0},
        x86.ASTOSQ:    {gc.OK, AX | DI, DI, 0},
index b3524c26c4829affd0923d5b6a48b1f90ee34535..d3921f7ecebf5919c4191a2923e818d14aafe20f 100644 (file)
@@ -409,6 +409,15 @@ func Cgen(n *Node, res *Node) {
                cgen_norm(n, &n1, res)
                return
 
+       case OSQRT:
+               var n1 Node
+               Regalloc(&n1, nl.Type, res)
+               Cgen(n.Left, &n1)
+               Thearch.Gins(Thearch.Optoas(OSQRT, nl.Type), &n1, &n1)
+               Thearch.Gmove(&n1, res)
+               Regfree(&n1)
+               return
+
                // symmetric binary
        case OAND,
                OOR,
index caae2f1ce15f762826a939f201bafdac9695ae7f..e0659fc8a4291d58df3d4ab3f446091aaa8810c5 100644 (file)
@@ -1002,6 +1002,9 @@ func gen(n *Node) {
        case ORETURN, ORETJMP:
                cgen_ret(n)
 
+       case OSQRT:
+               cgen_discard(n.Left)
+
        case OCHECKNIL:
                Cgen_checknil(n.Left)
 
index 8f5b85db1f169f6a5991fe81e4aa7550cb0b28b9..671a624c1d1fba2b7c1d166a3967042af9008787 100644 (file)
@@ -293,7 +293,7 @@ const (
        OREGISTER // a register, such as AX.
        OINDREG   // offset plus indirect of a register, such as 8(SP).
 
-       // 386/amd64-specific opcodes
+       // arch-specific opcodes
        OCMP    // compare: ACMP.
        ODEC    // decrement: ADEC.
        OINC    // increment: AINC.
@@ -303,6 +303,7 @@ const (
        ORROTC  // right rotate-carry: ARCR.
        ORETJMP // return to other function
        OPS     // compare parity set (for x86 NaN check)
+       OSQRT   // sqrt(float64), on systems that have hw support
 
        OEND
 )
index c10201aa2e3855b4ad8333040b5422d32e5d3274..a0a29d35ace1f5205ee168a136c5da6e5ad3185d 100644 (file)
@@ -622,6 +622,16 @@ func walkexpr(np **Node, init **NodeList) {
                walkexpr(&n.Left, init)
                walkexprlist(n.List, init)
 
+               if n.Left.Op == ONAME && n.Left.Sym.Name == "Sqrt" && n.Left.Sym.Pkg.Path == "math" {
+                       switch Thearch.Thechar {
+                       case '5', '6':
+                               n.Op = OSQRT
+                               n.Left = n.List.N
+                               n.List = nil
+                               goto ret
+                       }
+               }
+
                ll := ascompatte(int(n.Op), n, n.Isddd, getinarg(t), n.List, 0, init)
                n.List = reorder1(ll)
                goto ret
index c07ac740e34b45bedb363ddbdfd63c2f68cb7c2f..84061be2645c60030d7aa3f3bd04299f18ed43c2 100644 (file)
@@ -2977,15 +2977,56 @@ func BenchmarkSinh(b *testing.B) {
        }
 }
 
+var Global float64
+
 func BenchmarkSqrt(b *testing.B) {
+       x, y := 0.0, 10.0
+       for i := 0; i < b.N; i++ {
+               x += Sqrt(y)
+       }
+       Global = x
+}
+
+func BenchmarkSqrtIndirect(b *testing.B) {
+       x, y := 0.0, 10.0
+       f := Sqrt
        for i := 0; i < b.N; i++ {
-               Sqrt(10)
+               x += f(y)
        }
+       Global = x
 }
 
 func BenchmarkSqrtGo(b *testing.B) {
+       x, y := 0.0, 10.0
        for i := 0; i < b.N; i++ {
-               SqrtGo(10)
+               x += SqrtGo(y)
+       }
+       Global = x
+}
+
+func isPrime(i int) bool {
+       // Yes, this is a dumb way to write this code,
+       // but calling Sqrt repeatedly in this way demonstrates
+       // the benefit of using a direct SQRT instruction on systems
+       // that have one, whereas the obvious loop seems not to
+       // demonstrate such a benefit.
+       for j := 2; float64(j) <= Sqrt(float64(i)); j++ {
+               if i%j == 0 {
+                       return false
+               }
+       }
+       return true
+}
+
+func BenchmarkSqrtPrime(b *testing.B) {
+       any := false
+       for i := 0; i < b.N; i++ {
+               if isPrime(100003) {
+                       any = true
+               }
+       }
+       if any {
+               Global = 1
        }
 }
 
index fdc869992e11fe599bc96c9886cabbc6ea816880..23cf2996c2234d4bdc7fce9f351d5c18fa4a1def 100644 (file)
@@ -91,6 +91,11 @@ package math
 //     Sqrt(NaN) = NaN
 func Sqrt(x float64) float64
 
+// Note: Sqrt is implemented in assembly on some systems.
+// Others have assembly stubs that jump to func sqrt below.
+// On systems where Sqrt is a single instruction, the compiler
+// may turn a direct call into a direct use of that instruction instead.
+
 func sqrt(x float64) float64 {
        // special cases
        switch {