From b597e1ed543a309926a3c3a94518a135844470ce Mon Sep 17 00:00:00 2001 From: Ilya Tocar Date: Fri, 30 Oct 2015 17:54:39 +0300 Subject: [PATCH] runtime: speed up memclr with avx2 on amd64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Results are a bit noisy, but show good improvement (haswell) name old time/op new time/op delta Memclr5-48 6.06ns ± 8% 5.65ns ± 8% -6.81% (p=0.000 n=20+20) Memclr16-48 5.75ns ± 6% 5.71ns ± 6% ~ (p=0.545 n=20+19) Memclr64-48 6.54ns ± 5% 6.14ns ± 9% -6.12% (p=0.000 n=18+20) Memclr256-48 10.1ns ±12% 9.9ns ±14% ~ (p=0.285 n=20+19) Memclr4096-48 104ns ± 8% 57ns ±15% -44.98% (p=0.000 n=20+20) Memclr65536-48 2.45µs ± 5% 2.43µs ± 8% ~ (p=0.665 n=16+20) Memclr1M-48 58.7µs ±13% 56.4µs ±11% -3.92% (p=0.033 n=20+19) Memclr4M-48 233µs ± 9% 234µs ± 9% ~ (p=0.728 n=20+19) Memclr8M-48 469µs ±11% 472µs ±16% ~ (p=0.947 n=20+20) Memclr16M-48 947µs ±10% 916µs ±10% ~ (p=0.050 n=20+19) Memclr64M-48 10.9ms ±10% 4.5ms ± 9% -58.43% (p=0.000 n=20+20) GoMemclr5-48 3.80ns ±13% 3.38ns ± 6% -11.02% (p=0.000 n=20+20) GoMemclr16-48 3.34ns ±15% 3.40ns ± 9% ~ (p=0.351 n=20+20) GoMemclr64-48 4.10ns ±15% 4.04ns ±10% ~ (p=1.000 n=20+19) GoMemclr256-48 7.75ns ±20% 7.88ns ± 9% ~ (p=0.227 n=20+19) name old speed new speed delta Memclr5-48 826MB/s ± 7% 886MB/s ± 8% +7.32% (p=0.000 n=20+20) Memclr16-48 2.78GB/s ± 5% 2.81GB/s ± 6% ~ (p=0.550 n=20+19) Memclr64-48 9.79GB/s ± 5% 10.44GB/s ±10% +6.64% (p=0.000 n=18+20) Memclr256-48 25.4GB/s ±14% 25.6GB/s ±12% ~ (p=0.647 n=20+19) Memclr4096-48 39.4GB/s ± 8% 72.0GB/s ±13% +82.81% (p=0.000 n=20+20) Memclr65536-48 26.6GB/s ± 6% 27.0GB/s ± 9% ~ (p=0.517 n=17+20) Memclr1M-48 17.9GB/s ±12% 18.5GB/s ±11% ~ (p=0.068 n=20+20) Memclr4M-48 18.0GB/s ± 9% 17.8GB/s ±14% ~ (p=0.547 n=20+20) Memclr8M-48 17.9GB/s ±10% 17.8GB/s ±14% ~ (p=0.947 n=20+20) Memclr16M-48 17.8GB/s ± 9% 18.4GB/s ± 9% ~ (p=0.050 n=20+19) Memclr64M-48 6.19GB/s ±10% 14.87GB/s ± 9% +140.11% (p=0.000 n=20+20) GoMemclr5-48 1.31GB/s ±10% 1.48GB/s ± 6% +13.06% (p=0.000 n=19+20) GoMemclr16-48 4.81GB/s ±14% 4.71GB/s ± 8% ~ (p=0.341 n=20+20) GoMemclr64-48 15.7GB/s ±13% 15.8GB/s ±11% ~ (p=0.967 n=20+19) Change-Id: I393f3f20e2f31538d1b1dd70d6e5c201c106a095 Reviewed-on: https://go-review.googlesource.com/16773 Run-TryBot: Ilya Tocar TryBot-Result: Gobot Gobot Reviewed-by: Klaus Post Reviewed-by: Keith Randall --- src/cmd/internal/obj/x86/a.out.go | 1 + src/cmd/internal/obj/x86/anames.go | 1 + src/cmd/internal/obj/x86/asm6.go | 1 + src/runtime/memclr_amd64.s | 55 +++++++++++++++++++++++++++++- src/runtime/memmove_test.go | 5 +++ 5 files changed, 62 insertions(+), 1 deletion(-) diff --git a/src/cmd/internal/obj/x86/a.out.go b/src/cmd/internal/obj/x86/a.out.go index 0b5d8eb976..73abe3b705 100644 --- a/src/cmd/internal/obj/x86/a.out.go +++ b/src/cmd/internal/obj/x86/a.out.go @@ -747,6 +747,7 @@ const ( AMOVNTHD AMOVHDA AVPCMPEQB + AVPXOR AVPMOVMSKB AVPAND AVPTEST diff --git a/src/cmd/internal/obj/x86/anames.go b/src/cmd/internal/obj/x86/anames.go index f545baf994..d94d7eaff1 100644 --- a/src/cmd/internal/obj/x86/anames.go +++ b/src/cmd/internal/obj/x86/anames.go @@ -688,6 +688,7 @@ var Anames = []string{ "MOVNTHD", "MOVHDA", "VPCMPEQB", + "VPXOR", "VPMOVMSKB", "VPAND", "VPTEST", diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go index ed728aa727..19aee0df44 100644 --- a/src/cmd/internal/obj/x86/asm6.go +++ b/src/cmd/internal/obj/x86/asm6.go @@ -1514,6 +1514,7 @@ var optab = {AMOVNTHD, yxr_ml_vex, Pvex1, [23]uint8{0xe7}}, {AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}}, {AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}}, + {AVPXOR, yxm_xm_xm, Pvex1, [23]uint8{0xef, 0xef}}, {AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}}, {AVPAND, yxm_xm_xm, Pvex1, [23]uint8{0xdb, 0xdb}}, {AVPBROADCASTB, yml_xr_vex, Pvex3, [23]uint8{0x78, 0x78}}, diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s index 3e2c4b241a..5e78037df6 100644 --- a/src/runtime/memclr_amd64.s +++ b/src/runtime/memclr_amd64.s @@ -36,8 +36,10 @@ tail: JBE _65through128 CMPQ BX, $256 JBE _129through256 + CMPB runtime·support_avx2(SB), $1 + JE loop_preheader_avx2 // TODO: use branch table and BSR to make this just a single dispatch - // TODO: for really big clears, use MOVNTDQ. + // TODO: for really big clears, use MOVNTDQ, even without AVX2. loop: MOVOU X0, 0(DI) @@ -62,6 +64,57 @@ loop: JAE loop JMP tail +loop_preheader_avx2: + VPXOR X0, X0, X0 + // For smaller sizes MOVNTDQ may be faster or slower depending on hardware. + // For larger sizes it is always faster, even on dual Xeons with 30M cache. + // TODO take into account actual LLC size. E. g. glibc uses LLC size/2. + CMPQ BX, $0x2000000 + JAE loop_preheader_avx2_huge +loop_avx2: + MOVHDU X0, 0(DI) + MOVHDU X0, 32(DI) + MOVHDU X0, 64(DI) + MOVHDU X0, 96(DI) + SUBQ $128, BX + ADDQ $128, DI + CMPQ BX, $128 + JAE loop_avx2 + MOVHDU X0, -32(DI)(BX*1) + MOVHDU X0, -64(DI)(BX*1) + MOVHDU X0, -96(DI)(BX*1) + MOVHDU X0, -128(DI)(BX*1) + VZEROUPPER + RET +loop_preheader_avx2_huge: + // Align to 32 byte boundary + MOVHDU X0, 0(DI) + MOVQ DI, SI + ADDQ $32, DI + ANDQ $~31, DI + SUBQ DI, SI + ADDQ SI, BX +loop_avx2_huge: + MOVNTHD X0, 0(DI) + MOVNTHD X0, 32(DI) + MOVNTHD X0, 64(DI) + MOVNTHD X0, 96(DI) + SUBQ $128, BX + ADDQ $128, DI + CMPQ BX, $128 + JAE loop_avx2_huge + // In the desciption of MOVNTDQ in [1] + // "... fencing operation implemented with the SFENCE or MFENCE instruction + // should be used in conjunction with MOVNTDQ instructions..." + // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf + SFENCE + MOVHDU X0, -32(DI)(BX*1) + MOVHDU X0, -64(DI)(BX*1) + MOVHDU X0, -96(DI)(BX*1) + MOVHDU X0, -128(DI)(BX*1) + VZEROUPPER + RET + _1or2: MOVB AX, (DI) MOVB AX, -1(DI)(BX*1) diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go index d5a2ad8372..7f9d3f1427 100644 --- a/src/runtime/memmove_test.go +++ b/src/runtime/memmove_test.go @@ -196,6 +196,11 @@ func BenchmarkMemclr64(b *testing.B) { bmMemclr(b, 64) } func BenchmarkMemclr256(b *testing.B) { bmMemclr(b, 256) } func BenchmarkMemclr4096(b *testing.B) { bmMemclr(b, 4096) } func BenchmarkMemclr65536(b *testing.B) { bmMemclr(b, 65536) } +func BenchmarkMemclr1M(b *testing.B) { bmMemclr(b, 1<<20) } +func BenchmarkMemclr4M(b *testing.B) { bmMemclr(b, 4<<20) } +func BenchmarkMemclr8M(b *testing.B) { bmMemclr(b, 8<<20) } +func BenchmarkMemclr16M(b *testing.B) { bmMemclr(b, 16<<20) } +func BenchmarkMemclr64M(b *testing.B) { bmMemclr(b, 64<<20) } func bmGoMemclr(b *testing.B, n int) { x := make([]byte, n) -- 2.48.1