From: Shenghou Ma <minux.ma@gmail.com>
Date: Tue, 7 Aug 2012 15:45:50 +0000 (+0800)
Subject: runtime: inline several float64 routines to speed up complex128 division
X-Git-Tag: go1.1rc2~2681
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=0157c72d133471631c13419f61117b75dcd7c255;p=gostls13.git

runtime: inline several float64 routines to speed up complex128 division
Depends on CL 6197045.

Result obtained on Core i7 620M, Darwin/amd64:
benchmark                       old ns/op    new ns/op    delta
BenchmarkComplex128DivNormal           57           28  -50.78%
BenchmarkComplex128DivNisNaN           49           15  -68.90%
BenchmarkComplex128DivDisNaN           49           15  -67.88%
BenchmarkComplex128DivNisInf           40           12  -68.50%
BenchmarkComplex128DivDisInf           33           13  -61.06%

Result obtained on Core i7 620M, Darwin/386:
benchmark                       old ns/op    new ns/op    delta
BenchmarkComplex128DivNormal           89           50  -44.05%
BenchmarkComplex128DivNisNaN          307          802  +161.24%
BenchmarkComplex128DivDisNaN          309          788  +155.02%
BenchmarkComplex128DivNisInf          278          237  -14.75%
BenchmarkComplex128DivDisInf           46           22  -52.46%

Result obtained on 700MHz OMAP4460, Linux/ARM:
benchmark                       old ns/op    new ns/op    delta
BenchmarkComplex128DivNormal         1557          465  -70.13%
BenchmarkComplex128DivNisNaN         1443          220  -84.75%
BenchmarkComplex128DivDisNaN         1481          218  -85.28%
BenchmarkComplex128DivNisInf          952          216  -77.31%
BenchmarkComplex128DivDisInf          861          231  -73.17%

The 386 version shows a performance regression in the NaN cases, but
as we have decided to use SSE2 instead of the x87 FPU for 386 too
(issue 3912), I won't address that regression here.

R=dsymonds, mchaten, iant, dave, mtj, rsc, r
CC=golang-dev
https://golang.org/cl/6024045
---

diff --git a/src/pkg/runtime/complex.c b/src/pkg/runtime/complex.c
index eeb9439405..395e70fe34 100644
--- a/src/pkg/runtime/complex.c
+++ b/src/pkg/runtime/complex.c
@@ -13,28 +13,30 @@ runtime·complex128div(Complex128 n, Complex128 d, Complex128 q)
 	float64 a, b, ratio, denom;
 
 	// Special cases as in C99.
-	ninf = runtime·isInf(n.real, 0) || runtime·isInf(n.imag, 0);
-	dinf = runtime·isInf(d.real, 0) || runtime·isInf(d.imag, 0);
+	ninf = n.real == runtime·posinf || n.real == runtime·neginf ||
+	       n.imag == runtime·posinf || n.imag == runtime·neginf;
+	dinf = d.real == runtime·posinf || d.real == runtime·neginf ||
+	       d.imag == runtime·posinf || d.imag == runtime·neginf;
 
-	nnan = !ninf && (runtime·isNaN(n.real) || runtime·isNaN(n.imag));
-	dnan = !dinf && (runtime·isNaN(d.real) || runtime·isNaN(d.imag));
+	nnan = !ninf && (ISNAN(n.real) || ISNAN(n.imag));
+	dnan = !dinf && (ISNAN(d.real) || ISNAN(d.imag));
 
 	if(nnan || dnan) {
-		q.real = runtime·NaN();
-		q.imag = runtime·NaN();
-	} else if(ninf && !dinf && !dnan) {
-		q.real = runtime·Inf(0);
-		q.imag = runtime·Inf(0);
-	} else if(!ninf && !nnan && dinf) {
+		q.real = runtime·nan;
+		q.imag = runtime·nan;
+	} else if(ninf && !dinf) {
+		q.real = runtime·posinf;
+		q.imag = runtime·posinf;
+	} else if(!ninf && dinf) {
 		q.real = 0;
 		q.imag = 0;
 	} else if(d.real == 0 && d.imag == 0) {
 		if(n.real == 0 && n.imag == 0) {
-			q.real = runtime·NaN();
-			q.imag = runtime·NaN();
+			q.real = runtime·nan;
+			q.imag = runtime·nan;
 		} else {
-			q.real = runtime·Inf(0);
-			q.imag = runtime·Inf(0);
+			q.real = runtime·posinf;
+			q.imag = runtime·posinf;
 		}
 	} else {
 		// Standard complex arithmetic, factored to avoid unnecessary overflow.
diff --git a/src/pkg/runtime/float.c b/src/pkg/runtime/float.c
index 4d9f125977..42082e4347 100644
--- a/src/pkg/runtime/float.c
+++ b/src/pkg/runtime/float.c
@@ -4,170 +4,7 @@
 
 #include "runtime.h"
 
-static	uint64	uvnan		= 0x7FF8000000000001ULL;
-static	uint64	uvinf		= 0x7FF0000000000000ULL;
-static	uint64	uvneginf	= 0xFFF0000000000000ULL;
-
-uint32
-runtime·float32tobits(float32 f)
-{
-	// The obvious cast-and-pointer code is technically
-	// not valid, and gcc miscompiles it.  Use a union instead.
-	union {
-		float32 f;
-		uint32 i;
-	} u;
-	u.f = f;
-	return u.i;
-}
-
-uint64
-runtime·float64tobits(float64 f)
-{
-	// The obvious cast-and-pointer code is technically
-	// not valid, and gcc miscompiles it.  Use a union instead.
-	union {
-		float64 f;
-		uint64 i;
-	} u;
-	u.f = f;
-	return u.i;
-}
-
-float64
-runtime·float64frombits(uint64 i)
-{
-	// The obvious cast-and-pointer code is technically
-	// not valid, and gcc miscompiles it.  Use a union instead.
-	union {
-		float64 f;
-		uint64 i;
-	} u;
-	u.i = i;
-	return u.f;
-}
-
-float32
-runtime·float32frombits(uint32 i)
-{
-	// The obvious cast-and-pointer code is technically
-	// not valid, and gcc miscompiles it.  Use a union instead.
-	union {
-		float32 f;
-		uint32 i;
-	} u;
-	u.i = i;
-	return u.f;
-}
-
-bool
-runtime·isInf(float64 f, int32 sign)
-{
-	uint64 x;
-
-	x = runtime·float64tobits(f);
-	if(sign == 0)
-		return x == uvinf || x == uvneginf;
-	if(sign > 0)
-		return x == uvinf;
-	return x == uvneginf;
-}
-
-float64
-runtime·NaN(void)
-{
-	return runtime·float64frombits(uvnan);
-}
-
-bool
-runtime·isNaN(float64 f)
-{
-	uint64 x;
-
-	x = runtime·float64tobits(f);
-	return ((uint32)(x>>52) & 0x7FF) == 0x7FF && !runtime·isInf(f, 0);
-}
-
-float64
-runtime·Inf(int32 sign)
-{
-	if(sign >= 0)
-		return runtime·float64frombits(uvinf);
-	else
-		return runtime·float64frombits(uvneginf);
-}
-
-enum
-{
-	MASK	= 0x7ffL,
-	SHIFT	= 64-11-1,
-	BIAS	= 1022L,
-};
-
-float64
-runtime·frexp(float64 d, int32 *ep)
-{
-	uint64 x;
-
-	if(d == 0) {
-		*ep = 0;
-		return 0;
-	}
-	x = runtime·float64tobits(d);
-	*ep = (int32)((x >> SHIFT) & MASK) - BIAS;
-	x &= ~((uint64)MASK << SHIFT);
-	x |= (uint64)BIAS << SHIFT;
-	return runtime·float64frombits(x);
-}
-
-float64
-runtime·ldexp(float64 d, int32 e)
-{
-	uint64 x;
-
-	if(d == 0)
-		return 0;
-	x = runtime·float64tobits(d);
-	e += (int32)(x >> SHIFT) & MASK;
-	if(e <= 0)
-		return 0;	/* underflow */
-	if(e >= MASK){		/* overflow */
-		if(d < 0)
-			return runtime·Inf(-1);
-		return runtime·Inf(1);
-	}
-	x &= ~((uint64)MASK << SHIFT);
-	x |= (uint64)e << SHIFT;
-	return runtime·float64frombits(x);
-}
-
-float64
-runtime·modf(float64 d, float64 *ip)
-{
-	float64 dd;
-	uint64 x;
-	int32 e;
-
-	if(d < 1) {
-		if(d < 0) {
-			d = runtime·modf(-d, ip);
-			*ip = -*ip;
-			return -d;
-		}
-		*ip = 0;
-		return d;
-	}
-
-	x = runtime·float64tobits(d);
-	e = (int32)((x >> SHIFT) & MASK) - BIAS;
-
-	/*
-	 * Keep the top 11+e bits; clear the rest.
-	 */
-	if(e <= 64-11)
-		x &= ~(((uint64)1 << (64LL-11LL-e))-1);
-	dd = runtime·float64frombits(x);
-	*ip = dd;
-	return d - dd;
-}
-
+// used as float64 via runtime· names
+uint64	·nan		= 0x7FF8000000000001ULL;
+uint64	·posinf	= 0x7FF0000000000000ULL;
+uint64	·neginf	= 0xFFF0000000000000ULL;
diff --git a/src/pkg/runtime/print.c b/src/pkg/runtime/print.c
index b41e28b37a..fe21f1691a 100644
--- a/src/pkg/runtime/print.c
+++ b/src/pkg/runtime/print.c
@@ -209,15 +209,15 @@ runtime·printfloat(float64 v)
 	int32 e, s, i, n;
 	float64 h;
 
-	if(runtime·isNaN(v)) {
+	if(ISNAN(v)) {
 		gwrite("NaN", 3);
 		return;
 	}
-	if(runtime·isInf(v, 1)) {
+	if(v == runtime·posinf) {
 		gwrite("+Inf", 4);
 		return;
 	}
-	if(runtime·isInf(v, -1)) {
+	if(v == runtime·neginf) {
 		gwrite("-Inf", 4);
 		return;
 	}
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index dfdb3663c9..c8df87e5e8 100644
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -815,3 +815,12 @@ uintptr	runtime·memlimit(void);
 // is forced to deliver the signal to a thread that's actually running.
 // This is a no-op on other systems.
 void	runtime·setprof(bool);
+
+// float.c
+extern float64 runtime·nan;
+extern float64 runtime·posinf;
+extern float64 runtime·neginf;
+extern uint64 ·nan;
+extern uint64 ·posinf;
+extern uint64 ·neginf;
+#define ISNAN(f) ((f) != (f))