From 6e703ae7093b8921ce8e64a08e600d94ea1f9f28 Mon Sep 17 00:00:00 2001 From: Ilya Tocar Date: Thu, 1 Sep 2016 21:04:08 +0300 Subject: [PATCH] math: fix sqrt regression on AMD64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit 1.7 introduced a significant regression compared to 1.6: SqrtIndirect-4 2.32ns ± 0% 7.86ns ± 0% +238.79% (p=0.000 n=20+18) This is caused by sqrtsd preserving upper part of destination register. Which introduces dependency on previous value of X0. In 1.6 benchmark loop didn't use X0 immediately after call: callq *%rbx movsd 0x8(%rsp),%xmm2 movsd 0x20(%rsp),%xmm1 addsd %xmm2,%xmm1 mov 0x18(%rsp),%rax inc %rax jmp loop In 1.7 however xmm0 is used just after call: callq *%rbx mov 0x10(%rsp),%rcx lea 0x1(%rcx),%rax movsd 0x8(%rsp),%xmm0 movsd 0x18(%rsp),%xmm1 I've verified that this is caused by dependency, by inserting XORPS X0,X0 in the beginning of math.Sqrt, which puts performance back on 1.6 level. Splitting SQRTSD mem,reg into: MOVSD mem,reg SQRTSD reg,reg Removes dependency, because MOVSD (load version) doesn't need to preserve upper part of a register. And reg,reg operation is solved by renamer in CPU. As a result of this change regression is gone: SqrtIndirect-4 7.86ns ± 0% 2.33ns ± 0% -70.36% (p=0.000 n=18+17) This also removes old Sqrt benchmarks, in favor of benchmarks measuring latency. Only SqrtIndirect is kept, to show impact of this patch. Change-Id: Ic7eebe8866445adff5bc38192fa8d64c9a6b8872 Reviewed-on: https://go-review.googlesource.com/28392 Run-TryBot: Ilya Tocar Reviewed-by: Keith Randall --- src/math/all_test.go | 25 +++++++++++++++++-------- src/math/sqrt_amd64.s | 7 ++++--- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/math/all_test.go b/src/math/all_test.go index d9ea1fdb51..7d604b3e8b 100644 --- a/src/math/all_test.go +++ b/src/math/all_test.go @@ -3000,27 +3000,36 @@ func BenchmarkSinh(b *testing.B) { var Global float64 -func BenchmarkSqrt(b *testing.B) { +func BenchmarkSqrtIndirect(b *testing.B) { x, y := 0.0, 10.0 + f := Sqrt for i := 0; i < b.N; i++ { - x += Sqrt(y) + x += f(y) } Global = x } -func BenchmarkSqrtIndirect(b *testing.B) { - x, y := 0.0, 10.0 +func BenchmarkSqrtLatency(b *testing.B) { + x := 10.0 + for i := 0; i < b.N; i++ { + x = Sqrt(x) + } + Global = x +} + +func BenchmarkSqrtIndirectLatency(b *testing.B) { + x := 10.0 f := Sqrt for i := 0; i < b.N; i++ { - x += f(y) + x = f(x) } Global = x } -func BenchmarkSqrtGo(b *testing.B) { - x, y := 0.0, 10.0 +func BenchmarkSqrtGoLatency(b *testing.B) { + x := 10.0 for i := 0; i < b.N; i++ { - x += SqrtGo(y) + x = SqrtGo(x) } Global = x } diff --git a/src/math/sqrt_amd64.s b/src/math/sqrt_amd64.s index f8d825daab..d72000fccb 100644 --- a/src/math/sqrt_amd64.s +++ b/src/math/sqrt_amd64.s @@ -5,7 +5,8 @@ #include "textflag.h" // func Sqrt(x float64) float64 -TEXT ·Sqrt(SB),NOSPLIT,$0 - SQRTSD x+0(FP), X0 - MOVSD X0, ret+8(FP) +TEXT ·Sqrt(SB), NOSPLIT, $0 + MOVSD x+0(FP), X0 + SQRTSD X0, X1 + MOVSD X1, ret+8(FP) RET -- 2.48.1