From 6e703ae7093b8921ce8e64a08e600d94ea1f9f28 Mon Sep 17 00:00:00 2001
From: Ilya Tocar <ilya.tocar@intel.com>
Date: Thu, 1 Sep 2016 21:04:08 +0300
Subject: [PATCH] math: fix sqrt regression on AMD64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

1.7 introduced a significant regression compared to 1.6:

SqrtIndirect-4  2.32ns Â± 0%  7.86ns Â± 0%  +238.79%        (p=0.000 n=20+18)

This is caused by sqrtsd preserving upper part of destination register.
Which introduces dependency on previous  value of X0.
In 1.6 benchmark loop didn't use X0 immediately after call:

callq  *%rbx
movsd  0x8(%rsp),%xmm2
movsd  0x20(%rsp),%xmm1
addsd  %xmm2,%xmm1
mov    0x18(%rsp),%rax
inc    %rax
jmp    loop

In 1.7 however xmm0 is used just after call:

callq  *%rbx
mov    0x10(%rsp),%rcx
lea    0x1(%rcx),%rax
movsd  0x8(%rsp),%xmm0
movsd  0x18(%rsp),%xmm1

I've  verified that this is caused by dependency, by inserting
XORPS X0,X0 in the beginning of math.Sqrt, which puts performance back on 1.6 level.

Splitting SQRTSD mem,reg into:
MOVSD mem,reg
SQRTSD reg,reg

Removes dependency, because MOVSD (load version)
doesn't need to preserve upper part of a register.
And reg,reg operation is solved by renamer in CPU.

As a result of this change regression is gone:
SqrtIndirect-4  7.86ns Â± 0%  2.33ns Â± 0%  -70.36%  (p=0.000 n=18+17)

This also removes old Sqrt benchmarks, in favor of benchmarks measuring latency.
Only SqrtIndirect is kept, to show impact of this patch.

Change-Id: Ic7eebe8866445adff5bc38192fa8d64c9a6b8872
Reviewed-on: https://go-review.googlesource.com/28392
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
Reviewed-by: Keith Randall <khr@golang.org>
---
 src/math/all_test.go  | 25 +++++++++++++++++--------
 src/math/sqrt_amd64.s |  7 ++++---
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/math/all_test.go b/src/math/all_test.go
index d9ea1fdb51..7d604b3e8b 100644
--- a/src/math/all_test.go
+++ b/src/math/all_test.go
@@ -3000,27 +3000,36 @@ func BenchmarkSinh(b *testing.B) {
 
 var Global float64
 
-func BenchmarkSqrt(b *testing.B) {
+func BenchmarkSqrtIndirect(b *testing.B) {
 	x, y := 0.0, 10.0
+	f := Sqrt
 	for i := 0; i < b.N; i++ {
-		x += Sqrt(y)
+		x += f(y)
 	}
 	Global = x
 }
 
-func BenchmarkSqrtIndirect(b *testing.B) {
-	x, y := 0.0, 10.0
+func BenchmarkSqrtLatency(b *testing.B) {
+	x := 10.0
+	for i := 0; i < b.N; i++ {
+		x = Sqrt(x)
+	}
+	Global = x
+}
+
+func BenchmarkSqrtIndirectLatency(b *testing.B) {
+	x := 10.0
 	f := Sqrt
 	for i := 0; i < b.N; i++ {
-		x += f(y)
+		x = f(x)
 	}
 	Global = x
 }
 
-func BenchmarkSqrtGo(b *testing.B) {
-	x, y := 0.0, 10.0
+func BenchmarkSqrtGoLatency(b *testing.B) {
+	x := 10.0
 	for i := 0; i < b.N; i++ {
-		x += SqrtGo(y)
+		x = SqrtGo(x)
 	}
 	Global = x
 }
diff --git a/src/math/sqrt_amd64.s b/src/math/sqrt_amd64.s
index f8d825daab..d72000fccb 100644
--- a/src/math/sqrt_amd64.s
+++ b/src/math/sqrt_amd64.s
@@ -5,7 +5,8 @@
 #include "textflag.h"
 
 // func Sqrt(x float64) float64
-TEXT Â·Sqrt(SB),NOSPLIT,$0
-	SQRTSD x+0(FP), X0
-	MOVSD X0, ret+8(FP)
+TEXT Â·Sqrt(SB), NOSPLIT, $0
+	MOVSD  x+0(FP), X0
+	SQRTSD X0, X1
+	MOVSD  X1, ret+8(FP)
 	RET
-- 
2.52.0