From 330cffb86951414da5ef2fde912167f6b4d1d91e Mon Sep 17 00:00:00 2001 From: vpachkov Date: Fri, 1 Apr 2022 20:37:30 +0300 Subject: [PATCH] runtime: remove dead code and unnecessary checks for amd64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Use amd64 assembly header to remove unnecessary cpu flags checks and dead code that is guaranteed to not be executed when compiling for specific microarchitectures. name old time/op new time/op delta BytesCompare/1-12 3.88ns ± 1% 3.18ns ± 1% -18.15% (p=0.008 n=5+5) BytesCompare/2-12 3.89ns ± 1% 3.21ns ± 2% -17.66% (p=0.008 n=5+5) BytesCompare/4-12 3.89ns ± 0% 3.17ns ± 0% -18.62% (p=0.008 n=5+5) BytesCompare/8-12 3.44ns ± 2% 3.39ns ± 1% -1.36% (p=0.008 n=5+5) BytesCompare/16-12 3.40ns ± 1% 3.14ns ± 0% -7.77% (p=0.008 n=5+5) BytesCompare/32-12 3.90ns ± 1% 3.65ns ± 0% -6.19% (p=0.008 n=5+5) BytesCompare/64-12 4.96ns ± 1% 4.71ns ± 2% -4.98% (p=0.008 n=5+5) BytesCompare/128-12 6.42ns ± 0% 5.99ns ± 4% -6.75% (p=0.008 n=5+5) BytesCompare/256-12 9.36ns ± 0% 7.40ns ± 0% -20.97% (p=0.008 n=5+5) BytesCompare/512-12 15.9ns ± 1% 11.4ns ± 1% -28.36% (p=0.008 n=5+5) BytesCompare/1024-12 27.0ns ± 0% 19.3ns ± 0% -28.36% (p=0.008 n=5+5) BytesCompare/2048-12 50.2ns ± 0% 43.3ns ± 0% -13.71% (p=0.008 n=5+5) [Geo mean] 7.13ns 6.07ns -14.86% name old speed new speed delta Count/10-12 723MB/s ± 0% 704MB/s ± 1% -2.73% (p=0.008 n=5+5) Count/32-12 2.21GB/s ± 0% 2.12GB/s ± 2% -4.21% (p=0.008 n=5+5) Count/4K-12 1.03GB/s ± 0% 1.03GB/s ± 1% ~ (p=1.000 n=5+5) Count/4M-12 1.04GB/s ± 0% 1.02GB/s ± 2% ~ (p=0.310 n=5+5) Count/64M-12 1.02GB/s ± 0% 1.01GB/s ± 1% -1.00% (p=0.016 n=5+5) CountEasy/10-12 779MB/s ± 0% 768MB/s ± 1% -1.48% (p=0.008 n=5+5) CountEasy/32-12 2.15GB/s ± 0% 2.09GB/s ± 1% -2.71% (p=0.008 n=5+5) CountEasy/4K-12 45.1GB/s ± 1% 45.2GB/s ± 1% ~ (p=0.421 n=5+5) CountEasy/4M-12 36.4GB/s ± 1% 36.5GB/s ± 1% ~ (p=0.690 n=5+5) CountEasy/64M-12 16.1GB/s ± 2% 16.4GB/s ± 1% ~ (p=0.056 n=5+5) CountSingle/10-12 2.15GB/s ± 2% 2.22GB/s ± 1% +3.37% (p=0.008 n=5+5) CountSingle/32-12 5.86GB/s ± 1% 5.76GB/s ± 1% -1.55% (p=0.008 n=5+5) CountSingle/4K-12 54.6GB/s ± 1% 55.0GB/s ± 1% ~ (p=0.548 n=5+5) CountSingle/4M-12 45.9GB/s ± 4% 46.4GB/s ± 2% ~ (p=0.548 n=5+5) CountSingle/64M-12 17.3GB/s ± 1% 17.2GB/s ± 2% ~ (p=1.000 n=5+5) [Geo mean] 5.11GB/s 5.08GB/s -0.53% name old speed new speed delta Equal/1-12 200MB/s ± 0% 188MB/s ± 1% -6.11% (p=0.008 n=5+5) Equal/6-12 1.20GB/s ± 0% 1.13GB/s ± 1% -6.38% (p=0.008 n=5+5) Equal/9-12 1.67GB/s ± 3% 1.74GB/s ± 1% +3.83% (p=0.008 n=5+5) Equal/15-12 2.82GB/s ± 1% 2.89GB/s ± 1% +2.63% (p=0.008 n=5+5) Equal/16-12 2.96GB/s ± 1% 3.08GB/s ± 1% +3.95% (p=0.008 n=5+5) Equal/20-12 3.33GB/s ± 1% 3.54GB/s ± 1% +6.36% (p=0.008 n=5+5) Equal/32-12 4.57GB/s ± 0% 5.26GB/s ± 1% +15.09% (p=0.008 n=5+5) Equal/4K-12 62.0GB/s ± 1% 65.9GB/s ± 2% +6.29% (p=0.008 n=5+5) Equal/4M-12 23.6GB/s ± 2% 24.8GB/s ± 4% +5.43% (p=0.008 n=5+5) Equal/64M-12 11.1GB/s ± 2% 11.3GB/s ± 1% +1.69% (p=0.008 n=5+5) [Geo mean] 3.91GB/s 4.03GB/s +3.11% name old speed new speed delta IndexByte/10-12 2.64GB/s ± 0% 2.69GB/s ± 0% +1.67% (p=0.008 n=5+5) IndexByte/32-12 6.79GB/s ± 0% 6.27GB/s ± 0% -7.57% (p=0.008 n=5+5) IndexByte/4K-12 56.2GB/s ± 0% 56.9GB/s ± 0% +1.27% (p=0.008 n=5+5) IndexByte/4M-12 40.1GB/s ± 1% 41.7GB/s ± 1% +4.05% (p=0.008 n=5+5) IndexByte/64M-12 17.5GB/s ± 0% 17.7GB/s ± 1% ~ (p=0.095 n=5+5) IndexBytePortable/10-12 2.06GB/s ± 1% 2.16GB/s ± 1% +5.08% (p=0.008 n=5+5) IndexBytePortable/32-12 1.40GB/s ± 1% 1.54GB/s ± 1% +10.05% (p=0.008 n=5+5) IndexBytePortable/4K-12 3.99GB/s ± 0% 4.08GB/s ± 0% +2.16% (p=0.008 n=5+5) IndexBytePortable/4M-12 4.05GB/s ± 1% 4.08GB/s ± 2% ~ (p=0.095 n=5+5) IndexBytePortable/64M-12 3.80GB/s ± 1% 3.81GB/s ± 0% ~ (p=0.421 n=5+5) IndexRune/10-12 746MB/s ± 1% 752MB/s ± 0% +0.85% (p=0.008 n=5+5) IndexRune/32-12 2.33GB/s ± 0% 2.42GB/s ± 0% +3.66% (p=0.008 n=5+5) IndexRune/4K-12 44.4GB/s ± 0% 44.2GB/s ± 0% ~ (p=0.095 n=5+5) IndexRune/4M-12 36.2GB/s ± 1% 36.3GB/s ± 2% ~ (p=0.841 n=5+5) IndexRune/64M-12 16.2GB/s ± 2% 16.3GB/s ± 2% ~ (p=0.548 n=5+5) IndexRuneASCII/10-12 2.57GB/s ± 0% 2.58GB/s ± 0% +0.63% (p=0.008 n=5+5) IndexRuneASCII/32-12 6.00GB/s ± 0% 6.30GB/s ± 1% +4.98% (p=0.008 n=5+5) IndexRuneASCII/4K-12 56.7GB/s ± 0% 56.8GB/s ± 1% ~ (p=0.151 n=5+5) IndexRuneASCII/4M-12 41.6GB/s ± 1% 41.7GB/s ± 2% ~ (p=0.151 n=5+5) IndexRuneASCII/64M-12 17.7GB/s ± 1% 17.6GB/s ± 1% ~ (p=0.222 n=5+5) Index/10-12 1.06GB/s ± 1% 1.06GB/s ± 0% ~ (p=0.310 n=5+5) Index/32-12 3.57GB/s ± 0% 3.56GB/s ± 1% ~ (p=0.056 n=5+5) Index/4K-12 1.02GB/s ± 2% 1.03GB/s ± 0% ~ (p=0.690 n=5+5) Index/4M-12 1.04GB/s ± 0% 1.03GB/s ± 1% ~ (p=1.000 n=4+5) Index/64M-12 1.02GB/s ± 0% 1.02GB/s ± 0% ~ (p=0.905 n=5+4) IndexEasy/10-12 1.12GB/s ± 2% 1.15GB/s ± 1% +3.10% (p=0.008 n=5+5) IndexEasy/32-12 3.14GB/s ± 2% 3.13GB/s ± 1% ~ (p=0.310 n=5+5) IndexEasy/4K-12 47.6GB/s ± 1% 47.7GB/s ± 2% ~ (p=0.310 n=5+5) IndexEasy/4M-12 36.4GB/s ± 1% 36.3GB/s ± 2% ~ (p=0.690 n=5+5) IndexEasy/64M-12 16.1GB/s ± 1% 16.4GB/s ± 5% ~ (p=0.151 n=5+5) [Geo mean] 6.39GB/s 6.46GB/s +1.11% Change-Id: Ic1ca62f5cc719d87e2c4aeff25ad73507facff82 Reviewed-on: https://go-review.googlesource.com/c/go/+/397576 Reviewed-by: Keith Randall Run-TryBot: Keith Randall TryBot-Result: Gopher Robot Reviewed-by: Michael Knyszek --- src/internal/bytealg/compare_amd64.s | 7 +++++++ src/internal/bytealg/count_amd64.s | 7 +++++++ src/internal/bytealg/equal_amd64.s | 3 +++ src/internal/bytealg/index_amd64.s | 2 ++ src/internal/bytealg/indexbyte_amd64.s | 2 ++ src/runtime/asm_amd64.h | 11 +++++++++++ src/runtime/mkpreempt.go | 5 +++++ src/runtime/preempt_amd64.s | 3 +++ 8 files changed, 40 insertions(+) diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s index 4ccaca5e87..fdd015f560 100644 --- a/src/internal/bytealg/compare_amd64.s +++ b/src/internal/bytealg/compare_amd64.s @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. #include "go_asm.h" +#include "asm_amd64.h" #include "textflag.h" TEXT ·Compare(SB),NOSPLIT,$0-56 @@ -44,9 +45,13 @@ TEXT cmpbody<>(SB),NOSPLIT,$0-0 CMPQ R8, $63 JBE loop +#ifndef hasAVX2 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 JEQ big_loop_avx2 JMP big_loop +#else + JMP big_loop_avx2 +#endif loop: CMPQ R8, $16 JBE _0through16 @@ -155,6 +160,7 @@ allsame: RET // this works for >= 64 bytes of data. +#ifndef hasAVX2 big_loop: MOVOU (SI), X0 MOVOU (DI), X1 @@ -190,6 +196,7 @@ big_loop: CMPQ R8, $64 JBE loop JMP big_loop +#endif // Compare 64-bytes per loop iteration. // Loop is unrolled and uses AVX2. diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s index fa864c4c76..efb17f84b7 100644 --- a/src/internal/bytealg/count_amd64.s +++ b/src/internal/bytealg/count_amd64.s @@ -3,12 +3,15 @@ // license that can be found in the LICENSE file. #include "go_asm.h" +#include "asm_amd64.h" #include "textflag.h" TEXT ·Count(SB),NOSPLIT,$0-40 +#ifndef hasPOPCNT CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 JEQ 2(PC) JMP ·countGeneric(SB) +#endif MOVQ b_base+0(FP), SI MOVQ b_len+8(FP), BX MOVB c+24(FP), AL @@ -16,9 +19,11 @@ TEXT ·Count(SB),NOSPLIT,$0-40 JMP countbody<>(SB) TEXT ·CountString(SB),NOSPLIT,$0-32 +#ifndef hasPOPCNT CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 JEQ 2(PC) JMP ·countGenericString(SB) +#endif MOVQ s_base+0(FP), SI MOVQ s_len+8(FP), BX MOVB c+16(FP), AL @@ -151,8 +156,10 @@ endofpage: RET avx2: +#ifndef hasAVX2 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 JNE sse +#endif MOVD AX, X0 LEAQ -32(SI)(BX*1), R11 VPBROADCASTB X0, Y1 diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s index dd46e2e0fd..d178a33779 100644 --- a/src/internal/bytealg/equal_amd64.s +++ b/src/internal/bytealg/equal_amd64.s @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. #include "go_asm.h" +#include "asm_amd64.h" #include "textflag.h" // memequal(a, b unsafe.Pointer, size uintptr) bool @@ -46,6 +47,7 @@ TEXT memeqbody<>(SB),NOSPLIT,$0-0 JB small CMPQ BX, $64 JB bigloop +#ifndef hasAVX2 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 JE hugeloop_avx2 @@ -76,6 +78,7 @@ hugeloop: JEQ hugeloop XORQ AX, AX // return 0 RET +#endif // 64 bytes at a time using ymm registers hugeloop_avx2: diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s index 6193b57239..04314917b8 100644 --- a/src/internal/bytealg/index_amd64.s +++ b/src/internal/bytealg/index_amd64.s @@ -233,8 +233,10 @@ success_avx2: VZEROUPPER JMP success sse42: +#ifndef hasSSE42 CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1 JNE no_sse42 +#endif CMPQ AX, $12 // PCMPESTRI is slower than normal compare, // so using it makes sense only if we advance 4+ bytes per compare diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s index f78093c539..1ca70e39e2 100644 --- a/src/internal/bytealg/indexbyte_amd64.s +++ b/src/internal/bytealg/indexbyte_amd64.s @@ -115,8 +115,10 @@ endofpage: RET avx2: +#ifndef hasAVX2 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 JNE sse +#endif MOVD AX, X0 LEAQ -32(SI)(BX*1), R11 VPBROADCASTB X0, Y1 diff --git a/src/runtime/asm_amd64.h b/src/runtime/asm_amd64.h index 49e0ee2323..f7a8896db6 100644 --- a/src/runtime/asm_amd64.h +++ b/src/runtime/asm_amd64.h @@ -5,10 +5,21 @@ // Define features that are guaranteed to be supported by setting the AMD64 variable. // If a feature is supported, there's no need to check it at runtime every time. +#ifdef GOAMD64_v2 +#define hasPOPCNT +#define hasSSE42 +#endif + #ifdef GOAMD64_v3 +#define hasAVX #define hasAVX2 +#define hasPOPCNT +#define hasSSE42 #endif #ifdef GOAMD64_v4 +#define hasAVX #define hasAVX2 +#define hasPOPCNT +#define hasSSE42 #endif diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 28befcbd0d..61d2d0247e 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -126,6 +126,9 @@ func header(arch string) { fmt.Fprintf(out, "//go:build %s || %sle\n\n", base, base) } fmt.Fprintf(out, "#include \"go_asm.h\"\n") + if arch == "amd64" { + fmt.Fprintf(out, "#include \"asm_amd64.h\"\n") + } fmt.Fprintf(out, "#include \"textflag.h\"\n\n") fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n") } @@ -267,8 +270,10 @@ func genAMD64() { // Clear the upper bits to get to a clean state. See issue #37174. // It is safe here as Go code don't use the upper bits of Y registers. p("#ifdef GOOS_darwin") + p("#ifndef hasAVX") p("CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0") p("JE 2(PC)") + p("#endif") p("VZEROUPPER") p("#endif") diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s index 31f7c8b66f..94a84fb74c 100644 --- a/src/runtime/preempt_amd64.s +++ b/src/runtime/preempt_amd64.s @@ -1,6 +1,7 @@ // Code generated by mkpreempt.go; DO NOT EDIT. #include "go_asm.h" +#include "asm_amd64.h" #include "textflag.h" TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 @@ -27,8 +28,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVQ R14, 96(SP) MOVQ R15, 104(SP) #ifdef GOOS_darwin + #ifndef hasAVX CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0 JE 2(PC) + #endif VZEROUPPER #endif MOVUPS X0, 112(SP) -- 2.50.0