runtime: store pointer-size words in memclr
author    nimelehin <nimelehin@gmail.com>
Mon, 9 May 2022 20:22:14 +0000 (23:22 +0300)
committer Keith Randall <khr@golang.org>
Tue, 10 May 2022 20:52:34 +0000 (20:52 +0000)
The GC requires that each zeroed pointer-size word become visible to the
memory subsystem as a whole. While the Enhanced REP STOSB implementation
tries to use the most efficient stores it can, e.g. writing a whole cache
line rather than byte after byte, we should use REP STOSQ to guarantee the
word-granularity stores the GC requires.

Performance is not affected.

Change-Id: I1b0fd1444a40bfbb661541291ab96eba11bcc762
Reviewed-on: https://go-review.googlesource.com/c/go/+/405274
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
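
For illustration only (not part of this CL), here is a minimal Go sketch of
the store-granularity point made in the commit message: zeroing with
pointer-size stores means a word is never observed half cleared, which is
what the REP STOSQ path guarantees and a byte-wise REP STOSB clear does not.
The helper name clearWordwise is hypothetical.

// clearWordwise is a hypothetical illustration, not the runtime code:
// it clears n bytes at p with pointer-size stores first and a byte tail
// afterwards, mirroring what the REP STOSQ loop plus the existing tail
// path do in memclr_amd64.s.
package main

import "unsafe"

func clearWordwise(p unsafe.Pointer, n uintptr) {
	const ws = unsafe.Sizeof(uintptr(0)) // 8 bytes on amd64
	words := n / ws                      // like SHRQ $3, CX before REP STOSQ
	for i := uintptr(0); i < words; i++ {
		// Each store clears a whole pointer-size word at once, so the
		// GC never observes a partially zeroed word.
		*(*uintptr)(unsafe.Add(p, i*ws)) = 0
	}
	for i := words * ws; i < n; i++ { // like ANDQ $7, BX plus the tail path
		*(*byte)(unsafe.Add(p, i)) = 0
	}
}

func main() {
	buf := make([]byte, 37)
	for i := range buf {
		buf[i] = 0xff
	}
	clearWordwise(unsafe.Pointer(&buf[0]), uintptr(len(buf)))
	for _, b := range buf {
		if b != 0 {
			panic("buffer not fully cleared")
		}
	}
}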
src/runtime/memclr_amd64.s

index b8f283b8fdcd56502bd814b50645a4b48b9553a3..19bfa6f20d49b5b0eaf27f07f8f402922e83535f 100644 (file)
@@ -18,7 +18,7 @@ TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
        MOVQ    AX, DI  // DI = ptr
        XORQ    AX, AX
 
-       // MOVOU seems always faster than REP STOSQ.
+       // MOVOU seems always faster than REP STOSQ when Enhanced REP STOSQ is not available.
 tail:
        // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
        TESTQ   BX, BX
@@ -119,9 +119,13 @@ loop_preheader_erms:
        JAE     loop_preheader_avx2_huge
 
 loop_erms:
+       // STOSQ is used to guarantee that the whole zeroed pointer-sized word is visible
+       // for a memory subsystem as the GC requires this.
        MOVQ    BX, CX
-       REP;    STOSB
-       RET
+       SHRQ    $3, CX
+       ANDQ    $7, BX
+       REP;    STOSQ
+       JMP     tail
 
 loop_preheader_avx2_huge:
        // Align to 32 byte boundary
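
A rough sketch (hypothetical helper, not in the CL) of the count arithmetic
in the new loop_erms body: SHRQ $3, CX leaves the number of quadwords for
REP STOSQ, and ANDQ $7, BX keeps the leftover bytes that JMP tail hands back
to the existing byte-granularity tail code.

// splitERMSCount is a hypothetical helper mirroring the register
// arithmetic above: REP STOSQ clears n>>3 quadwords, and the low three
// bits of n are left for the tail path.
func splitERMSCount(n uint64) (quadwords, tailBytes uint64) {
	return n >> 3, n & 7 // SHRQ $3, CX and ANDQ $7, BX
}

For example, splitERMSCount(1003) returns 125 quadwords (1000 bytes cleared
by REP STOSQ) and 3 tail bytes, whereas the old code issued all 1003 stores
byte by byte with REP STOSB.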