From e0f99775f24cd19ad96847b0a5b00952aac9d548 Mon Sep 17 00:00:00 2001 From: nimelehin Date: Mon, 9 May 2022 23:22:14 +0300 Subject: [PATCH] runtime: store pointer-size words in memclr GC requires the whole zeroed word to be visible for a memory subsystem. While the implementation of Enhanced REP STOSB tries to use as efficient stores as possible, e.g writing the whole cache line and not byte-after-byte, we should use REP STOSQ to guarantee the requirements of the GC. The performance is not affected. Change-Id: I1b0fd1444a40bfbb661541291ab96eba11bcc762 Reviewed-on: https://go-review.googlesource.com/c/go/+/405274 Reviewed-by: Cherry Mui Reviewed-by: Keith Randall Reviewed-by: Keith Randall --- src/runtime/memclr_amd64.s | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s index b8f283b8fd..19bfa6f20d 100644 --- a/src/runtime/memclr_amd64.s +++ b/src/runtime/memclr_amd64.s @@ -18,7 +18,7 @@ TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16 MOVQ AX, DI // DI = ptr XORQ AX, AX - // MOVOU seems always faster than REP STOSQ. + // MOVOU seems always faster than REP STOSQ when Enhanced REP STOSQ is not available. tail: // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing. TESTQ BX, BX @@ -119,9 +119,13 @@ loop_preheader_erms: JAE loop_preheader_avx2_huge loop_erms: + // STOSQ is used to guarantee that the whole zeroed pointer-sized word is visible + // for a memory subsystem as the GC requires this. MOVQ BX, CX - REP; STOSB - RET + SHRQ $3, CX + ANDQ $7, BX + REP; STOSQ + JMP tail loop_preheader_avx2_huge: // Align to 32 byte boundary -- 2.48.1