1.7 introduced a significant regression compared to 1.6:
SqrtIndirect-4 2.32ns ± 0% 7.86ns ± 0% +238.79% (p=0.000 n=20+18)
This is caused by sqrtsd preserving upper part of destination register.
Which introduces dependency on previous value of X0.
In 1.6 benchmark loop didn't use X0 immediately after call:
callq *%rbx
movsd 0x8(%rsp),%xmm2
movsd 0x20(%rsp),%xmm1
addsd %xmm2,%xmm1
mov 0x18(%rsp),%rax
inc %rax
jmp loop
In 1.7 however xmm0 is used just after call:
callq *%rbx
mov 0x10(%rsp),%rcx
lea 0x1(%rcx),%rax
movsd 0x8(%rsp),%xmm0
movsd 0x18(%rsp),%xmm1
I've verified that this is caused by dependency, by inserting
XORPS X0,X0 in the beginning of math.Sqrt, which puts performance back on 1.6 level.
Splitting SQRTSD mem,reg into:
MOVSD mem,reg
SQRTSD reg,reg
Removes dependency, because MOVSD (load version)
doesn't need to preserve upper part of a register.
And reg,reg operation is solved by renamer in CPU.
As a result of this change regression is gone:
SqrtIndirect-4 7.86ns ± 0% 2.33ns ± 0% -70.36% (p=0.000 n=18+17)
This also removes old Sqrt benchmarks, in favor of benchmarks measuring latency.
Only SqrtIndirect is kept, to show impact of this patch.
Change-Id: Ic7eebe8866445adff5bc38192fa8d64c9a6b8872
Reviewed-on: https://go-review.googlesource.com/28392
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
Reviewed-by: Keith Randall <khr@golang.org>
var Global float64
-func BenchmarkSqrt(b *testing.B) {
+func BenchmarkSqrtIndirect(b *testing.B) {
x, y := 0.0, 10.0
+ f := Sqrt
for i := 0; i < b.N; i++ {
- x += Sqrt(y)
+ x += f(y)
}
Global = x
}
-func BenchmarkSqrtIndirect(b *testing.B) {
- x, y := 0.0, 10.0
+func BenchmarkSqrtLatency(b *testing.B) {
+ x := 10.0
+ for i := 0; i < b.N; i++ {
+ x = Sqrt(x)
+ }
+ Global = x
+}
+
+func BenchmarkSqrtIndirectLatency(b *testing.B) {
+ x := 10.0
f := Sqrt
for i := 0; i < b.N; i++ {
- x += f(y)
+ x = f(x)
}
Global = x
}
-func BenchmarkSqrtGo(b *testing.B) {
- x, y := 0.0, 10.0
+func BenchmarkSqrtGoLatency(b *testing.B) {
+ x := 10.0
for i := 0; i < b.N; i++ {
- x += SqrtGo(y)
+ x = SqrtGo(x)
}
Global = x
}
#include "textflag.h"
// func Sqrt(x float64) float64
-TEXT ·Sqrt(SB),NOSPLIT,$0
- SQRTSD x+0(FP), X0
- MOVSD X0, ret+8(FP)
+TEXT ·Sqrt(SB), NOSPLIT, $0
+ MOVSD x+0(FP), X0
+ SQRTSD X0, X1
+ MOVSD X1, ret+8(FP)
RET