runtime: store pointer-size words in memclr

author nimelehin <nimelehin@gmail.com>

Mon, 9 May 2022 20:22:14 +0000 (23:22 +0300)

committer Keith Randall <khr@golang.org>

Tue, 10 May 2022 20:52:34 +0000 (20:52 +0000)
author nimelehin <nimelehin@gmail.com>
Mon, 9 May 2022 20:22:14 +0000 (23:22 +0300)
committer Keith Randall <khr@golang.org>
Tue, 10 May 2022 20:52:34 +0000 (20:52 +0000)
diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s

index b8f283b8fdcd56502bd814b50645a4b48b9553a3..19bfa6f20d49b5b0eaf27f07f8f402922e83535f 100644 (file)
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -18,7 +18,7 @@ TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
         MOVQ    AX, DI  // DI = ptr
         XORQ    AX, AX
  
-       // MOVOU seems always faster than REP STOSQ.
+       // MOVOU seems always faster than REP STOSQ when Enhanced REP STOSQ is not available.
  tail:
         // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
         TESTQ   BX, BX
@@ -119,9 +119,13 @@ loop_preheader_erms:
         JAE     loop_preheader_avx2_huge
  
  loop_erms:
+       // STOSQ is used to guarantee that the whole zeroed pointer-sized word is visible
+       // to the memory subsystem, as the GC requires.
         MOVQ    BX, CX
-       REP;    STOSB
-       RET
+       SHRQ    $3, CX
+       ANDQ    $7, BX
+       REP;    STOSQ
+       JMP     tail
  
  loop_preheader_avx2_huge:
         // Align to 32 byte boundary
author	nimelehin <nimelehin@gmail.com>
	Mon, 9 May 2022 20:22:14 +0000 (23:22 +0300)
committer	Keith Randall <khr@golang.org>
	Tue, 10 May 2022 20:52:34 +0000 (20:52 +0000)