From 330cffb86951414da5ef2fde912167f6b4d1d91e Mon Sep 17 00:00:00 2001
From: vpachkov <slava.pach@gmail.com>
Date: Fri, 1 Apr 2022 20:37:30 +0300
Subject: [PATCH] runtime: remove dead code and unnecessary checks for amd64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Use the amd64 assembly header to remove unnecessary CPU feature checks
and dead code that is guaranteed not to be executed when compiling
for a specific microarchitecture level.

name                  old time/op  new time/op  delta
BytesCompare/1-12     3.88ns ± 1%  3.18ns ± 1%  -18.15%  (p=0.008 n=5+5)
BytesCompare/2-12     3.89ns ± 1%  3.21ns ± 2%  -17.66%  (p=0.008 n=5+5)
BytesCompare/4-12     3.89ns ± 0%  3.17ns ± 0%  -18.62%  (p=0.008 n=5+5)
BytesCompare/8-12     3.44ns ± 2%  3.39ns ± 1%   -1.36%  (p=0.008 n=5+5)
BytesCompare/16-12    3.40ns ± 1%  3.14ns ± 0%   -7.77%  (p=0.008 n=5+5)
BytesCompare/32-12    3.90ns ± 1%  3.65ns ± 0%   -6.19%  (p=0.008 n=5+5)
BytesCompare/64-12    4.96ns ± 1%  4.71ns ± 2%   -4.98%  (p=0.008 n=5+5)
BytesCompare/128-12   6.42ns ± 0%  5.99ns ± 4%   -6.75%  (p=0.008 n=5+5)
BytesCompare/256-12   9.36ns ± 0%  7.40ns ± 0%  -20.97%  (p=0.008 n=5+5)
BytesCompare/512-12   15.9ns ± 1%  11.4ns ± 1%  -28.36%  (p=0.008 n=5+5)
BytesCompare/1024-12  27.0ns ± 0%  19.3ns ± 0%  -28.36%  (p=0.008 n=5+5)
BytesCompare/2048-12  50.2ns ± 0%  43.3ns ± 0%  -13.71%  (p=0.008 n=5+5)
[Geo mean]            7.13ns       6.07ns       -14.86%

name                old speed      new speed      delta
Count/10-12          723MB/s ± 0%   704MB/s ± 1%  -2.73%  (p=0.008 n=5+5)
Count/32-12         2.21GB/s ± 0%  2.12GB/s ± 2%  -4.21%  (p=0.008 n=5+5)
Count/4K-12         1.03GB/s ± 0%  1.03GB/s ± 1%    ~     (p=1.000 n=5+5)
Count/4M-12         1.04GB/s ± 0%  1.02GB/s ± 2%    ~     (p=0.310 n=5+5)
Count/64M-12        1.02GB/s ± 0%  1.01GB/s ± 1%  -1.00%  (p=0.016 n=5+5)
CountEasy/10-12      779MB/s ± 0%   768MB/s ± 1%  -1.48%  (p=0.008 n=5+5)
CountEasy/32-12     2.15GB/s ± 0%  2.09GB/s ± 1%  -2.71%  (p=0.008 n=5+5)
CountEasy/4K-12     45.1GB/s ± 1%  45.2GB/s ± 1%    ~     (p=0.421 n=5+5)
CountEasy/4M-12     36.4GB/s ± 1%  36.5GB/s ± 1%    ~     (p=0.690 n=5+5)
CountEasy/64M-12    16.1GB/s ± 2%  16.4GB/s ± 1%    ~     (p=0.056 n=5+5)
CountSingle/10-12   2.15GB/s ± 2%  2.22GB/s ± 1%  +3.37%  (p=0.008 n=5+5)
CountSingle/32-12   5.86GB/s ± 1%  5.76GB/s ± 1%  -1.55%  (p=0.008 n=5+5)
CountSingle/4K-12   54.6GB/s ± 1%  55.0GB/s ± 1%    ~     (p=0.548 n=5+5)
CountSingle/4M-12   45.9GB/s ± 4%  46.4GB/s ± 2%    ~     (p=0.548 n=5+5)
CountSingle/64M-12  17.3GB/s ± 1%  17.2GB/s ± 2%    ~     (p=1.000 n=5+5)
[Geo mean]          5.11GB/s       5.08GB/s       -0.53%

name          old speed      new speed      delta
Equal/1-12     200MB/s ± 0%   188MB/s ± 1%   -6.11%  (p=0.008 n=5+5)
Equal/6-12    1.20GB/s ± 0%  1.13GB/s ± 1%   -6.38%  (p=0.008 n=5+5)
Equal/9-12    1.67GB/s ± 3%  1.74GB/s ± 1%   +3.83%  (p=0.008 n=5+5)
Equal/15-12   2.82GB/s ± 1%  2.89GB/s ± 1%   +2.63%  (p=0.008 n=5+5)
Equal/16-12   2.96GB/s ± 1%  3.08GB/s ± 1%   +3.95%  (p=0.008 n=5+5)
Equal/20-12   3.33GB/s ± 1%  3.54GB/s ± 1%   +6.36%  (p=0.008 n=5+5)
Equal/32-12   4.57GB/s ± 0%  5.26GB/s ± 1%  +15.09%  (p=0.008 n=5+5)
Equal/4K-12   62.0GB/s ± 1%  65.9GB/s ± 2%   +6.29%  (p=0.008 n=5+5)
Equal/4M-12   23.6GB/s ± 2%  24.8GB/s ± 4%   +5.43%  (p=0.008 n=5+5)
Equal/64M-12  11.1GB/s ± 2%  11.3GB/s ± 1%   +1.69%  (p=0.008 n=5+5)
[Geo mean]    3.91GB/s       4.03GB/s        +3.11%

name                              old speed      new speed      delta
IndexByte/10-12                   2.64GB/s ± 0%  2.69GB/s ± 0%   +1.67%  (p=0.008 n=5+5)
IndexByte/32-12                   6.79GB/s ± 0%  6.27GB/s ± 0%   -7.57%  (p=0.008 n=5+5)
IndexByte/4K-12                   56.2GB/s ± 0%  56.9GB/s ± 0%   +1.27%  (p=0.008 n=5+5)
IndexByte/4M-12                   40.1GB/s ± 1%  41.7GB/s ± 1%   +4.05%  (p=0.008 n=5+5)
IndexByte/64M-12                  17.5GB/s ± 0%  17.7GB/s ± 1%     ~     (p=0.095 n=5+5)
IndexBytePortable/10-12           2.06GB/s ± 1%  2.16GB/s ± 1%   +5.08%  (p=0.008 n=5+5)
IndexBytePortable/32-12           1.40GB/s ± 1%  1.54GB/s ± 1%  +10.05%  (p=0.008 n=5+5)
IndexBytePortable/4K-12           3.99GB/s ± 0%  4.08GB/s ± 0%   +2.16%  (p=0.008 n=5+5)
IndexBytePortable/4M-12           4.05GB/s ± 1%  4.08GB/s ± 2%     ~     (p=0.095 n=5+5)
IndexBytePortable/64M-12          3.80GB/s ± 1%  3.81GB/s ± 0%     ~     (p=0.421 n=5+5)
IndexRune/10-12                    746MB/s ± 1%   752MB/s ± 0%   +0.85%  (p=0.008 n=5+5)
IndexRune/32-12                   2.33GB/s ± 0%  2.42GB/s ± 0%   +3.66%  (p=0.008 n=5+5)
IndexRune/4K-12                   44.4GB/s ± 0%  44.2GB/s ± 0%     ~     (p=0.095 n=5+5)
IndexRune/4M-12                   36.2GB/s ± 1%  36.3GB/s ± 2%     ~     (p=0.841 n=5+5)
IndexRune/64M-12                  16.2GB/s ± 2%  16.3GB/s ± 2%     ~     (p=0.548 n=5+5)
IndexRuneASCII/10-12              2.57GB/s ± 0%  2.58GB/s ± 0%   +0.63%  (p=0.008 n=5+5)
IndexRuneASCII/32-12              6.00GB/s ± 0%  6.30GB/s ± 1%   +4.98%  (p=0.008 n=5+5)
IndexRuneASCII/4K-12              56.7GB/s ± 0%  56.8GB/s ± 1%     ~     (p=0.151 n=5+5)
IndexRuneASCII/4M-12              41.6GB/s ± 1%  41.7GB/s ± 2%     ~     (p=0.151 n=5+5)
IndexRuneASCII/64M-12             17.7GB/s ± 1%  17.6GB/s ± 1%     ~     (p=0.222 n=5+5)
Index/10-12                       1.06GB/s ± 1%  1.06GB/s ± 0%     ~     (p=0.310 n=5+5)
Index/32-12                       3.57GB/s ± 0%  3.56GB/s ± 1%     ~     (p=0.056 n=5+5)
Index/4K-12                       1.02GB/s ± 2%  1.03GB/s ± 0%     ~     (p=0.690 n=5+5)
Index/4M-12                       1.04GB/s ± 0%  1.03GB/s ± 1%     ~     (p=1.000 n=4+5)
Index/64M-12                      1.02GB/s ± 0%  1.02GB/s ± 0%     ~     (p=0.905 n=5+4)
IndexEasy/10-12                   1.12GB/s ± 2%  1.15GB/s ± 1%   +3.10%  (p=0.008 n=5+5)
IndexEasy/32-12                   3.14GB/s ± 2%  3.13GB/s ± 1%     ~     (p=0.310 n=5+5)
IndexEasy/4K-12                   47.6GB/s ± 1%  47.7GB/s ± 2%     ~     (p=0.310 n=5+5)
IndexEasy/4M-12                   36.4GB/s ± 1%  36.3GB/s ± 2%     ~     (p=0.690 n=5+5)
IndexEasy/64M-12                  16.1GB/s ± 1%  16.4GB/s ± 5%     ~     (p=0.151 n=5+5)
[Geo mean]                        6.39GB/s       6.46GB/s        +1.11%

Change-Id: Ic1ca62f5cc719d87e2c4aeff25ad73507facff82
Reviewed-on: https://go-review.googlesource.com/c/go/+/397576
Reviewed-by: Keith Randall <khr@google.com>
Run-TryBot: Keith Randall <khr@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
---
 src/internal/bytealg/compare_amd64.s   |  7 +++++++
 src/internal/bytealg/count_amd64.s     |  7 +++++++
 src/internal/bytealg/equal_amd64.s     |  3 +++
 src/internal/bytealg/index_amd64.s     |  2 ++
 src/internal/bytealg/indexbyte_amd64.s |  2 ++
 src/runtime/asm_amd64.h                | 11 +++++++++++
 src/runtime/mkpreempt.go               |  5 +++++
 src/runtime/preempt_amd64.s            |  3 +++
 8 files changed, 40 insertions(+)

diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s
index 4ccaca5e87..fdd015f560 100644
--- a/src/internal/bytealg/compare_amd64.s
+++ b/src/internal/bytealg/compare_amd64.s
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 TEXT Â·Compare<ABIInternal>(SB),NOSPLIT,$0-56
@@ -44,9 +45,13 @@ TEXT cmpbody<>(SB),NOSPLIT,$0-0
 
 	CMPQ	R8, $63
 	JBE	loop
+#ifndef hasAVX2
 	CMPB	internalâcpuÂ·X86+const_offsetX86HasAVX2(SB), $1
 	JEQ     big_loop_avx2
 	JMP	big_loop
+#else
+	JMP	big_loop_avx2
+#endif
 loop:
 	CMPQ	R8, $16
 	JBE	_0through16
@@ -155,6 +160,7 @@ allsame:
 	RET
 
 	// this works for >= 64 bytes of data.
+#ifndef hasAVX2
 big_loop:
 	MOVOU	(SI), X0
 	MOVOU	(DI), X1
@@ -190,6 +196,7 @@ big_loop:
 	CMPQ	R8, $64
 	JBE	loop
 	JMP	big_loop
+#endif
 
 	// Compare 64-bytes per loop iteration.
 	// Loop is unrolled and uses AVX2.
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s
index fa864c4c76..efb17f84b7 100644
--- a/src/internal/bytealg/count_amd64.s
+++ b/src/internal/bytealg/count_amd64.s
@@ -3,12 +3,15 @@
 // license that can be found in the LICENSE file.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 TEXT Â·Count(SB),NOSPLIT,$0-40
+#ifndef hasPOPCNT
 	CMPB	internalâcpuÂ·X86+const_offsetX86HasPOPCNT(SB), $1
 	JEQ	2(PC)
 	JMP	Â·countGeneric(SB)
+#endif
 	MOVQ	b_base+0(FP), SI
 	MOVQ	b_len+8(FP), BX
 	MOVB	c+24(FP), AL
@@ -16,9 +19,11 @@ TEXT Â·Count(SB),NOSPLIT,$0-40
 	JMP	countbody<>(SB)
 
 TEXT Â·CountString(SB),NOSPLIT,$0-32
+#ifndef hasPOPCNT
 	CMPB	internalâcpuÂ·X86+const_offsetX86HasPOPCNT(SB), $1
 	JEQ	2(PC)
 	JMP	Â·countGenericString(SB)
+#endif
 	MOVQ	s_base+0(FP), SI
 	MOVQ	s_len+8(FP), BX
 	MOVB	c+16(FP), AL
@@ -151,8 +156,10 @@ endofpage:
 	RET
 
 avx2:
+#ifndef hasAVX2
 	CMPB   internalâcpuÂ·X86+const_offsetX86HasAVX2(SB), $1
 	JNE sse
+#endif
 	MOVD AX, X0
 	LEAQ -32(SI)(BX*1), R11
 	VPBROADCASTB  X0, Y1
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s
index dd46e2e0fd..d178a33779 100644
--- a/src/internal/bytealg/equal_amd64.s
+++ b/src/internal/bytealg/equal_amd64.s
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 // memequal(a, b unsafe.Pointer, size uintptr) bool
@@ -46,6 +47,7 @@ TEXT memeqbody<>(SB),NOSPLIT,$0-0
 	JB	small
 	CMPQ	BX, $64
 	JB	bigloop
+#ifndef hasAVX2
 	CMPB	internalâcpuÂ·X86+const_offsetX86HasAVX2(SB), $1
 	JE	hugeloop_avx2
 
@@ -76,6 +78,7 @@ hugeloop:
 	JEQ	hugeloop
 	XORQ	AX, AX	// return 0
 	RET
+#endif
 
 	// 64 bytes at a time using ymm registers
 hugeloop_avx2:
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
index 6193b57239..04314917b8 100644
--- a/src/internal/bytealg/index_amd64.s
+++ b/src/internal/bytealg/index_amd64.s
@@ -233,8 +233,10 @@ success_avx2:
 	VZEROUPPER
 	JMP success
 sse42:
+#ifndef hasSSE42
 	CMPB internalâcpuÂ·X86+const_offsetX86HasSSE42(SB), $1
 	JNE no_sse42
+#endif
 	CMPQ AX, $12
 	// PCMPESTRI is slower than normal compare,
 	// so using it makes sense only if we advance 4+ bytes per compare
diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s
index f78093c539..1ca70e39e2 100644
--- a/src/internal/bytealg/indexbyte_amd64.s
+++ b/src/internal/bytealg/indexbyte_amd64.s
@@ -115,8 +115,10 @@ endofpage:
 	RET
 
 avx2:
+#ifndef hasAVX2
 	CMPB   internalâcpuÂ·X86+const_offsetX86HasAVX2(SB), $1
 	JNE sse
+#endif
 	MOVD AX, X0
 	LEAQ -32(SI)(BX*1), R11
 	VPBROADCASTB  X0, Y1
diff --git a/src/runtime/asm_amd64.h b/src/runtime/asm_amd64.h
index 49e0ee2323..f7a8896db6 100644
--- a/src/runtime/asm_amd64.h
+++ b/src/runtime/asm_amd64.h
@@ -5,10 +5,21 @@
 // Define features that are guaranteed to be supported by setting the AMD64 variable.
 // If a feature is supported, there's no need to check it at runtime every time.
 
+#ifdef GOAMD64_v2
+#define hasPOPCNT
+#define hasSSE42
+#endif
+
 #ifdef GOAMD64_v3
+#define hasAVX
 #define hasAVX2
+#define hasPOPCNT
+#define hasSSE42
 #endif
 
 #ifdef GOAMD64_v4
+#define hasAVX
 #define hasAVX2
+#define hasPOPCNT
+#define hasSSE42
 #endif
diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go
index 28befcbd0d..61d2d0247e 100644
--- a/src/runtime/mkpreempt.go
+++ b/src/runtime/mkpreempt.go
@@ -126,6 +126,9 @@ func header(arch string) {
 		fmt.Fprintf(out, "//go:build %s || %sle\n\n", base, base)
 	}
 	fmt.Fprintf(out, "#include \"go_asm.h\"\n")
+	if arch == "amd64" {
+		fmt.Fprintf(out, "#include \"asm_amd64.h\"\n")
+	}
 	fmt.Fprintf(out, "#include \"textflag.h\"\n\n")
 	fmt.Fprintf(out, "TEXT Â·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n")
 }
@@ -267,8 +270,10 @@ func genAMD64() {
 	// Clear the upper bits to get to a clean state. See issue #37174.
 	// It is safe here as Go code don't use the upper bits of Y registers.
 	p("#ifdef GOOS_darwin")
+	p("#ifndef hasAVX")
 	p("CMPB internalâcpuÂ·X86+const_offsetX86HasAVX(SB), $0")
 	p("JE 2(PC)")
+	p("#endif")
 	p("VZEROUPPER")
 	p("#endif")
 
diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s
index 31f7c8b66f..94a84fb74c 100644
--- a/src/runtime/preempt_amd64.s
+++ b/src/runtime/preempt_amd64.s
@@ -1,6 +1,7 @@
 // Code generated by mkpreempt.go; DO NOT EDIT.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 TEXT Â·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
@@ -27,8 +28,10 @@ TEXT Â·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
 	MOVQ R14, 96(SP)
 	MOVQ R15, 104(SP)
 	#ifdef GOOS_darwin
+	#ifndef hasAVX
 	CMPB internalâcpuÂ·X86+const_offsetX86HasAVX(SB), $0
 	JE 2(PC)
+	#endif
 	VZEROUPPER
 	#endif
 	MOVUPS X0, 112(SP)
-- 
2.50.0