From b597e1ed543a309926a3c3a94518a135844470ce Mon Sep 17 00:00:00 2001
From: Ilya Tocar <ilya.tocar@intel.com>
Date: Fri, 30 Oct 2015 17:54:39 +0300
Subject: [PATCH] runtime: speed up memclr with avx2 on amd64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Results are a bit noisy, but show good improvement (haswell)

name            old time/op    new time/op     delta
Memclr5-48        6.06ns Â± 8%     5.65ns Â± 8%    -6.81%  (p=0.000 n=20+20)
Memclr16-48       5.75ns Â± 6%     5.71ns Â± 6%      ~     (p=0.545 n=20+19)
Memclr64-48       6.54ns Â± 5%     6.14ns Â± 9%    -6.12%  (p=0.000 n=18+20)
Memclr256-48      10.1ns Â±12%      9.9ns Â±14%      ~     (p=0.285 n=20+19)
Memclr4096-48      104ns Â± 8%       57ns Â±15%   -44.98%  (p=0.000 n=20+20)
Memclr65536-48    2.45Âµs Â± 5%     2.43Âµs Â± 8%      ~     (p=0.665 n=16+20)
Memclr1M-48       58.7Âµs Â±13%     56.4Âµs Â±11%    -3.92%  (p=0.033 n=20+19)
Memclr4M-48        233Âµs Â± 9%      234Âµs Â± 9%      ~     (p=0.728 n=20+19)
Memclr8M-48        469Âµs Â±11%      472Âµs Â±16%      ~     (p=0.947 n=20+20)
Memclr16M-48       947Âµs Â±10%      916Âµs Â±10%      ~     (p=0.050 n=20+19)
Memclr64M-48      10.9ms Â±10%      4.5ms Â± 9%   -58.43%  (p=0.000 n=20+20)
GoMemclr5-48      3.80ns Â±13%     3.38ns Â± 6%   -11.02%  (p=0.000 n=20+20)
GoMemclr16-48     3.34ns Â±15%     3.40ns Â± 9%      ~     (p=0.351 n=20+20)
GoMemclr64-48     4.10ns Â±15%     4.04ns Â±10%      ~     (p=1.000 n=20+19)
GoMemclr256-48    7.75ns Â±20%     7.88ns Â± 9%      ~     (p=0.227 n=20+19)

name            old speed      new speed       delta
Memclr5-48       826MB/s Â± 7%    886MB/s Â± 8%    +7.32%  (p=0.000 n=20+20)
Memclr16-48     2.78GB/s Â± 5%   2.81GB/s Â± 6%      ~     (p=0.550 n=20+19)
Memclr64-48     9.79GB/s Â± 5%  10.44GB/s Â±10%    +6.64%  (p=0.000 n=18+20)
Memclr256-48    25.4GB/s Â±14%   25.6GB/s Â±12%      ~     (p=0.647 n=20+19)
Memclr4096-48   39.4GB/s Â± 8%   72.0GB/s Â±13%   +82.81%  (p=0.000 n=20+20)
Memclr65536-48  26.6GB/s Â± 6%   27.0GB/s Â± 9%      ~     (p=0.517 n=17+20)
Memclr1M-48     17.9GB/s Â±12%   18.5GB/s Â±11%      ~     (p=0.068 n=20+20)
Memclr4M-48     18.0GB/s Â± 9%   17.8GB/s Â±14%      ~     (p=0.547 n=20+20)
Memclr8M-48     17.9GB/s Â±10%   17.8GB/s Â±14%      ~     (p=0.947 n=20+20)
Memclr16M-48    17.8GB/s Â± 9%   18.4GB/s Â± 9%      ~     (p=0.050 n=20+19)
Memclr64M-48    6.19GB/s Â±10%  14.87GB/s Â± 9%  +140.11%  (p=0.000 n=20+20)
GoMemclr5-48    1.31GB/s Â±10%   1.48GB/s Â± 6%   +13.06%  (p=0.000 n=19+20)
GoMemclr16-48   4.81GB/s Â±14%   4.71GB/s Â± 8%      ~     (p=0.341 n=20+20)
GoMemclr64-48   15.7GB/s Â±13%   15.8GB/s Â±11%      ~     (p=0.967 n=20+19)

Change-Id: I393f3f20e2f31538d1b1dd70d6e5c201c106a095
Reviewed-on: https://go-review.googlesource.com/16773
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Klaus Post <klauspost@gmail.com>
Reviewed-by: Keith Randall <khr@golang.org>
---
 src/cmd/internal/obj/x86/a.out.go  |  1 +
 src/cmd/internal/obj/x86/anames.go |  1 +
 src/cmd/internal/obj/x86/asm6.go   |  1 +
 src/runtime/memclr_amd64.s         | 55 +++++++++++++++++++++++++++++-
 src/runtime/memmove_test.go        |  5 +++
 5 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/src/cmd/internal/obj/x86/a.out.go b/src/cmd/internal/obj/x86/a.out.go
index 0b5d8eb976..73abe3b705 100644
--- a/src/cmd/internal/obj/x86/a.out.go
+++ b/src/cmd/internal/obj/x86/a.out.go
@@ -747,6 +747,7 @@ const (
 	AMOVNTHD
 	AMOVHDA
 	AVPCMPEQB
+	AVPXOR
 	AVPMOVMSKB
 	AVPAND
 	AVPTEST
diff --git a/src/cmd/internal/obj/x86/anames.go b/src/cmd/internal/obj/x86/anames.go
index f545baf994..d94d7eaff1 100644
--- a/src/cmd/internal/obj/x86/anames.go
+++ b/src/cmd/internal/obj/x86/anames.go
@@ -688,6 +688,7 @@ var Anames = []string{
 	"MOVNTHD",
 	"MOVHDA",
 	"VPCMPEQB",
+	"VPXOR",
 	"VPMOVMSKB",
 	"VPAND",
 	"VPTEST",
diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go
index ed728aa727..19aee0df44 100644
--- a/src/cmd/internal/obj/x86/asm6.go
+++ b/src/cmd/internal/obj/x86/asm6.go
@@ -1514,6 +1514,7 @@ var optab =
 	{AMOVNTHD, yxr_ml_vex, Pvex1, [23]uint8{0xe7}},
 	{AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}},
 	{AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}},
+	{AVPXOR, yxm_xm_xm, Pvex1, [23]uint8{0xef, 0xef}},
 	{AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}},
 	{AVPAND, yxm_xm_xm, Pvex1, [23]uint8{0xdb, 0xdb}},
 	{AVPBROADCASTB, yml_xr_vex, Pvex3, [23]uint8{0x78, 0x78}},
diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s
index 3e2c4b241a..5e78037df6 100644
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -36,8 +36,10 @@ tail:
 	JBE	_65through128
 	CMPQ	BX, $256
 	JBE	_129through256
+	CMPB	runtimeÂ·support_avx2(SB), $1
+	JE loop_preheader_avx2
 	// TODO: use branch table and BSR to make this just a single dispatch
-	// TODO: for really big clears, use MOVNTDQ.
+	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
 
 loop:
 	MOVOU	X0, 0(DI)
@@ -62,6 +64,57 @@ loop:
 	JAE	loop
 	JMP	tail
 
+loop_preheader_avx2:
+	VPXOR X0, X0, X0
+	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
+	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
+	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
+	CMPQ    BX, $0x2000000
+	JAE     loop_preheader_avx2_huge
+loop_avx2:
+	MOVHDU	X0, 0(DI)
+	MOVHDU	X0, 32(DI)
+	MOVHDU	X0, 64(DI)
+	MOVHDU	X0, 96(DI)
+	SUBQ	$128, BX
+	ADDQ	$128, DI
+	CMPQ	BX, $128
+	JAE	loop_avx2
+	MOVHDU  X0, -32(DI)(BX*1)
+	MOVHDU  X0, -64(DI)(BX*1)
+	MOVHDU  X0, -96(DI)(BX*1)
+	MOVHDU  X0, -128(DI)(BX*1)
+	VZEROUPPER
+	RET
+loop_preheader_avx2_huge:
+	// Align to 32 byte boundary
+	MOVHDU  X0, 0(DI)
+	MOVQ	DI, SI
+	ADDQ	$32, DI
+	ANDQ	$~31, DI
+	SUBQ	DI, SI
+	ADDQ	SI, BX
+loop_avx2_huge:
+	MOVNTHD	X0, 0(DI)
+	MOVNTHD	X0, 32(DI)
+	MOVNTHD	X0, 64(DI)
+	MOVNTHD	X0, 96(DI)
+	SUBQ	$128, BX
+	ADDQ	$128, DI
+	CMPQ	BX, $128
+	JAE	loop_avx2_huge
+	// In the desciption of MOVNTDQ in [1]
+	// "... fencing operation implemented with the SFENCE or MFENCE instruction
+	// should be used in conjunction with MOVNTDQ instructions..."
+	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
+	SFENCE
+	MOVHDU  X0, -32(DI)(BX*1)
+	MOVHDU  X0, -64(DI)(BX*1)
+	MOVHDU  X0, -96(DI)(BX*1)
+	MOVHDU  X0, -128(DI)(BX*1)
+	VZEROUPPER
+	RET
+
 _1or2:
 	MOVB	AX, (DI)
 	MOVB	AX, -1(DI)(BX*1)
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index d5a2ad8372..7f9d3f1427 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -196,6 +196,11 @@ func BenchmarkMemclr64(b *testing.B)    { bmMemclr(b, 64) }
 func BenchmarkMemclr256(b *testing.B)   { bmMemclr(b, 256) }
 func BenchmarkMemclr4096(b *testing.B)  { bmMemclr(b, 4096) }
 func BenchmarkMemclr65536(b *testing.B) { bmMemclr(b, 65536) }
+func BenchmarkMemclr1M(b *testing.B)    { bmMemclr(b, 1<<20) }
+func BenchmarkMemclr4M(b *testing.B)    { bmMemclr(b, 4<<20) }
+func BenchmarkMemclr8M(b *testing.B)    { bmMemclr(b, 8<<20) }
+func BenchmarkMemclr16M(b *testing.B)   { bmMemclr(b, 16<<20) }
+func BenchmarkMemclr64M(b *testing.B)   { bmMemclr(b, 64<<20) }
 
 func bmGoMemclr(b *testing.B, n int) {
 	x := make([]byte, n)
-- 
2.52.0