runtime: use ERMS in memclr_amd64

author nimelehin <nimelehin@gmail.com>

Fri, 24 Dec 2021 12:29:18 +0000 (15:29 +0300)

committer Gopher Robot <gobot@golang.org>

Mon, 9 May 2022 17:41:45 +0000 (17:41 +0000)
author nimelehin <nimelehin@gmail.com>
Fri, 24 Dec 2021 12:29:18 +0000 (15:29 +0300)
committer Gopher Robot <gobot@golang.org>
Mon, 9 May 2022 17:41:45 +0000 (17:41 +0000)
diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s

index 26a6205e615b243cb7ccd1a22c60d074fbe48ab8..b8f283b8fdcd56502bd814b50645a4b48b9553a3 100644 (file)
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -41,9 +41,20 @@ tail:
         CMPQ    BX, $256
         JBE     _129through256
  
+       CMPB    internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
+       JNE     skip_erms
+
+       // If the size is less than 2kb, do not use ERMS as it has a big start-up cost.
+       // Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
+       // in the Intel Optimization Guide shows better performance for ERMSB starting
+       // from 2KB. Benchmarks show the similar threshold for REP STOS vs AVX.
+       CMPQ    BX, $2048
+       JAE     loop_preheader_erms
+
+skip_erms:
  #ifndef hasAVX2
         CMPB    internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
-       JE loop_preheader_avx2
+       JE      loop_preheader_avx2
         // TODO: for really big clears, use MOVNTDQ, even without AVX2.
  
  loop:
@@ -71,12 +82,13 @@ loop:
  #endif
  
  loop_preheader_avx2:
-       VPXOR Y0, Y0, Y0
+       VPXOR X0, X0, X0
         // For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
         // For larger sizes it is always faster, even on dual Xeons with 30M cache.
         // TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
         CMPQ    BX, $0x2000000
-       JAE     loop_preheader_avx2_huge
+       JAE     loop_preheader_avx2_huge
+
  loop_avx2:
         VMOVDQU Y0, 0(DI)
         VMOVDQU Y0, 32(DI)
@@ -92,6 +104,25 @@ loop_avx2:
         VMOVDQU  Y0, -128(DI)(BX*1)
         VZEROUPPER
         RET
+
+loop_preheader_erms:
+#ifndef hasAVX2
+       CMPB    internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+       JNE     loop_erms
+#endif
+
+       VPXOR X0, X0, X0
+       // At this point both ERMS and AVX2 is supported. While REP STOS can use a no-RFO
+       // write protocol, ERMS could show the same or slower performance comparing to
+       // Non-Temporal Stores when the size is bigger than LLC depending on hardware.
+       CMPQ    BX, $0x2000000
+       JAE     loop_preheader_avx2_huge
+
+loop_erms:
+       MOVQ    BX, CX
+       REP;    STOSB
+       RET
+
  loop_preheader_avx2_huge:
         // Align to 32 byte boundary
         VMOVDQU  Y0, 0(DI)
author	nimelehin <nimelehin@gmail.com>
	Fri, 24 Dec 2021 12:29:18 +0000 (15:29 +0300)
committer	Gopher Robot <gobot@golang.org>
	Mon, 9 May 2022 17:41:45 +0000 (17:41 +0000)