From: vpachkov Date: Fri, 1 Apr 2022 17:37:30 +0000 (+0300) Subject: runtime: remove dead code and unnecessary checks for amd64 X-Git-Tag: go1.20rc1~1550 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=330cffb86951414da5ef2fde912167f6b4d1d91e;p=gostls13.git runtime: remove dead code and unnecessary checks for amd64 Use amd64 assembly header to remove unnecessary cpu flags checks and dead code that is guaranteed to not be executed when compiling for specific microarchitectures. name old time/op new time/op delta BytesCompare/1-12 3.88ns ± 1% 3.18ns ± 1% -18.15% (p=0.008 n=5+5) BytesCompare/2-12 3.89ns ± 1% 3.21ns ± 2% -17.66% (p=0.008 n=5+5) BytesCompare/4-12 3.89ns ± 0% 3.17ns ± 0% -18.62% (p=0.008 n=5+5) BytesCompare/8-12 3.44ns ± 2% 3.39ns ± 1% -1.36% (p=0.008 n=5+5) BytesCompare/16-12 3.40ns ± 1% 3.14ns ± 0% -7.77% (p=0.008 n=5+5) BytesCompare/32-12 3.90ns ± 1% 3.65ns ± 0% -6.19% (p=0.008 n=5+5) BytesCompare/64-12 4.96ns ± 1% 4.71ns ± 2% -4.98% (p=0.008 n=5+5) BytesCompare/128-12 6.42ns ± 0% 5.99ns ± 4% -6.75% (p=0.008 n=5+5) BytesCompare/256-12 9.36ns ± 0% 7.40ns ± 0% -20.97% (p=0.008 n=5+5) BytesCompare/512-12 15.9ns ± 1% 11.4ns ± 1% -28.36% (p=0.008 n=5+5) BytesCompare/1024-12 27.0ns ± 0% 19.3ns ± 0% -28.36% (p=0.008 n=5+5) BytesCompare/2048-12 50.2ns ± 0% 43.3ns ± 0% -13.71% (p=0.008 n=5+5) [Geo mean] 7.13ns 6.07ns -14.86% name old speed new speed delta Count/10-12 723MB/s ± 0% 704MB/s ± 1% -2.73% (p=0.008 n=5+5) Count/32-12 2.21GB/s ± 0% 2.12GB/s ± 2% -4.21% (p=0.008 n=5+5) Count/4K-12 1.03GB/s ± 0% 1.03GB/s ± 1% ~ (p=1.000 n=5+5) Count/4M-12 1.04GB/s ± 0% 1.02GB/s ± 2% ~ (p=0.310 n=5+5) Count/64M-12 1.02GB/s ± 0% 1.01GB/s ± 1% -1.00% (p=0.016 n=5+5) CountEasy/10-12 779MB/s ± 0% 768MB/s ± 1% -1.48% (p=0.008 n=5+5) CountEasy/32-12 2.15GB/s ± 0% 2.09GB/s ± 1% -2.71% (p=0.008 n=5+5) CountEasy/4K-12 45.1GB/s ± 1% 45.2GB/s ± 1% ~ (p=0.421 n=5+5) CountEasy/4M-12 36.4GB/s ± 1% 36.5GB/s ± 1% ~ (p=0.690 n=5+5) CountEasy/64M-12 16.1GB/s ± 2% 16.4GB/s ± 1% ~ (p=0.056 n=5+5) CountSingle/10-12 2.15GB/s ± 2% 2.22GB/s ± 1% +3.37% (p=0.008 n=5+5) CountSingle/32-12 5.86GB/s ± 1% 5.76GB/s ± 1% -1.55% (p=0.008 n=5+5) CountSingle/4K-12 54.6GB/s ± 1% 55.0GB/s ± 1% ~ (p=0.548 n=5+5) CountSingle/4M-12 45.9GB/s ± 4% 46.4GB/s ± 2% ~ (p=0.548 n=5+5) CountSingle/64M-12 17.3GB/s ± 1% 17.2GB/s ± 2% ~ (p=1.000 n=5+5) [Geo mean] 5.11GB/s 5.08GB/s -0.53% name old speed new speed delta Equal/1-12 200MB/s ± 0% 188MB/s ± 1% -6.11% (p=0.008 n=5+5) Equal/6-12 1.20GB/s ± 0% 1.13GB/s ± 1% -6.38% (p=0.008 n=5+5) Equal/9-12 1.67GB/s ± 3% 1.74GB/s ± 1% +3.83% (p=0.008 n=5+5) Equal/15-12 2.82GB/s ± 1% 2.89GB/s ± 1% +2.63% (p=0.008 n=5+5) Equal/16-12 2.96GB/s ± 1% 3.08GB/s ± 1% +3.95% (p=0.008 n=5+5) Equal/20-12 3.33GB/s ± 1% 3.54GB/s ± 1% +6.36% (p=0.008 n=5+5) Equal/32-12 4.57GB/s ± 0% 5.26GB/s ± 1% +15.09% (p=0.008 n=5+5) Equal/4K-12 62.0GB/s ± 1% 65.9GB/s ± 2% +6.29% (p=0.008 n=5+5) Equal/4M-12 23.6GB/s ± 2% 24.8GB/s ± 4% +5.43% (p=0.008 n=5+5) Equal/64M-12 11.1GB/s ± 2% 11.3GB/s ± 1% +1.69% (p=0.008 n=5+5) [Geo mean] 3.91GB/s 4.03GB/s +3.11% name old speed new speed delta IndexByte/10-12 2.64GB/s ± 0% 2.69GB/s ± 0% +1.67% (p=0.008 n=5+5) IndexByte/32-12 6.79GB/s ± 0% 6.27GB/s ± 0% -7.57% (p=0.008 n=5+5) IndexByte/4K-12 56.2GB/s ± 0% 56.9GB/s ± 0% +1.27% (p=0.008 n=5+5) IndexByte/4M-12 40.1GB/s ± 1% 41.7GB/s ± 1% +4.05% (p=0.008 n=5+5) IndexByte/64M-12 17.5GB/s ± 0% 17.7GB/s ± 1% ~ (p=0.095 n=5+5) IndexBytePortable/10-12 2.06GB/s ± 1% 2.16GB/s ± 1% +5.08% (p=0.008 n=5+5) IndexBytePortable/32-12 1.40GB/s ± 1% 1.54GB/s ± 1% +10.05% (p=0.008 n=5+5) IndexBytePortable/4K-12 3.99GB/s ± 0% 4.08GB/s ± 0% +2.16% (p=0.008 n=5+5) IndexBytePortable/4M-12 4.05GB/s ± 1% 4.08GB/s ± 2% ~ (p=0.095 n=5+5) IndexBytePortable/64M-12 3.80GB/s ± 1% 3.81GB/s ± 0% ~ (p=0.421 n=5+5) IndexRune/10-12 746MB/s ± 1% 752MB/s ± 0% +0.85% (p=0.008 n=5+5) IndexRune/32-12 2.33GB/s ± 0% 2.42GB/s ± 0% +3.66% (p=0.008 n=5+5) IndexRune/4K-12 44.4GB/s ± 0% 44.2GB/s ± 0% ~ (p=0.095 n=5+5) IndexRune/4M-12 36.2GB/s ± 1% 36.3GB/s ± 2% ~ (p=0.841 n=5+5) IndexRune/64M-12 16.2GB/s ± 2% 16.3GB/s ± 2% ~ (p=0.548 n=5+5) IndexRuneASCII/10-12 2.57GB/s ± 0% 2.58GB/s ± 0% +0.63% (p=0.008 n=5+5) IndexRuneASCII/32-12 6.00GB/s ± 0% 6.30GB/s ± 1% +4.98% (p=0.008 n=5+5) IndexRuneASCII/4K-12 56.7GB/s ± 0% 56.8GB/s ± 1% ~ (p=0.151 n=5+5) IndexRuneASCII/4M-12 41.6GB/s ± 1% 41.7GB/s ± 2% ~ (p=0.151 n=5+5) IndexRuneASCII/64M-12 17.7GB/s ± 1% 17.6GB/s ± 1% ~ (p=0.222 n=5+5) Index/10-12 1.06GB/s ± 1% 1.06GB/s ± 0% ~ (p=0.310 n=5+5) Index/32-12 3.57GB/s ± 0% 3.56GB/s ± 1% ~ (p=0.056 n=5+5) Index/4K-12 1.02GB/s ± 2% 1.03GB/s ± 0% ~ (p=0.690 n=5+5) Index/4M-12 1.04GB/s ± 0% 1.03GB/s ± 1% ~ (p=1.000 n=4+5) Index/64M-12 1.02GB/s ± 0% 1.02GB/s ± 0% ~ (p=0.905 n=5+4) IndexEasy/10-12 1.12GB/s ± 2% 1.15GB/s ± 1% +3.10% (p=0.008 n=5+5) IndexEasy/32-12 3.14GB/s ± 2% 3.13GB/s ± 1% ~ (p=0.310 n=5+5) IndexEasy/4K-12 47.6GB/s ± 1% 47.7GB/s ± 2% ~ (p=0.310 n=5+5) IndexEasy/4M-12 36.4GB/s ± 1% 36.3GB/s ± 2% ~ (p=0.690 n=5+5) IndexEasy/64M-12 16.1GB/s ± 1% 16.4GB/s ± 5% ~ (p=0.151 n=5+5) [Geo mean] 6.39GB/s 6.46GB/s +1.11% Change-Id: Ic1ca62f5cc719d87e2c4aeff25ad73507facff82 Reviewed-on: https://go-review.googlesource.com/c/go/+/397576 Reviewed-by: Keith Randall Run-TryBot: Keith Randall TryBot-Result: Gopher Robot Reviewed-by: Michael Knyszek --- diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s index 4ccaca5e87..fdd015f560 100644 --- a/src/internal/bytealg/compare_amd64.s +++ b/src/internal/bytealg/compare_amd64.s @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. #include "go_asm.h" +#include "asm_amd64.h" #include "textflag.h" TEXT ·Compare(SB),NOSPLIT,$0-56 @@ -44,9 +45,13 @@ TEXT cmpbody<>(SB),NOSPLIT,$0-0 CMPQ R8, $63 JBE loop +#ifndef hasAVX2 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 JEQ big_loop_avx2 JMP big_loop +#else + JMP big_loop_avx2 +#endif loop: CMPQ R8, $16 JBE _0through16 @@ -155,6 +160,7 @@ allsame: RET // this works for >= 64 bytes of data. +#ifndef hasAVX2 big_loop: MOVOU (SI), X0 MOVOU (DI), X1 @@ -190,6 +196,7 @@ big_loop: CMPQ R8, $64 JBE loop JMP big_loop +#endif // Compare 64-bytes per loop iteration. // Loop is unrolled and uses AVX2. diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s index fa864c4c76..efb17f84b7 100644 --- a/src/internal/bytealg/count_amd64.s +++ b/src/internal/bytealg/count_amd64.s @@ -3,12 +3,15 @@ // license that can be found in the LICENSE file. #include "go_asm.h" +#include "asm_amd64.h" #include "textflag.h" TEXT ·Count(SB),NOSPLIT,$0-40 +#ifndef hasPOPCNT CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 JEQ 2(PC) JMP ·countGeneric(SB) +#endif MOVQ b_base+0(FP), SI MOVQ b_len+8(FP), BX MOVB c+24(FP), AL @@ -16,9 +19,11 @@ TEXT ·Count(SB),NOSPLIT,$0-40 JMP countbody<>(SB) TEXT ·CountString(SB),NOSPLIT,$0-32 +#ifndef hasPOPCNT CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 JEQ 2(PC) JMP ·countGenericString(SB) +#endif MOVQ s_base+0(FP), SI MOVQ s_len+8(FP), BX MOVB c+16(FP), AL @@ -151,8 +156,10 @@ endofpage: RET avx2: +#ifndef hasAVX2 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 JNE sse +#endif MOVD AX, X0 LEAQ -32(SI)(BX*1), R11 VPBROADCASTB X0, Y1 diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s index dd46e2e0fd..d178a33779 100644 --- a/src/internal/bytealg/equal_amd64.s +++ b/src/internal/bytealg/equal_amd64.s @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. #include "go_asm.h" +#include "asm_amd64.h" #include "textflag.h" // memequal(a, b unsafe.Pointer, size uintptr) bool @@ -46,6 +47,7 @@ TEXT memeqbody<>(SB),NOSPLIT,$0-0 JB small CMPQ BX, $64 JB bigloop +#ifndef hasAVX2 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 JE hugeloop_avx2 @@ -76,6 +78,7 @@ hugeloop: JEQ hugeloop XORQ AX, AX // return 0 RET +#endif // 64 bytes at a time using ymm registers hugeloop_avx2: diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s index 6193b57239..04314917b8 100644 --- a/src/internal/bytealg/index_amd64.s +++ b/src/internal/bytealg/index_amd64.s @@ -233,8 +233,10 @@ success_avx2: VZEROUPPER JMP success sse42: +#ifndef hasSSE42 CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1 JNE no_sse42 +#endif CMPQ AX, $12 // PCMPESTRI is slower than normal compare, // so using it makes sense only if we advance 4+ bytes per compare diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s index f78093c539..1ca70e39e2 100644 --- a/src/internal/bytealg/indexbyte_amd64.s +++ b/src/internal/bytealg/indexbyte_amd64.s @@ -115,8 +115,10 @@ endofpage: RET avx2: +#ifndef hasAVX2 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 JNE sse +#endif MOVD AX, X0 LEAQ -32(SI)(BX*1), R11 VPBROADCASTB X0, Y1 diff --git a/src/runtime/asm_amd64.h b/src/runtime/asm_amd64.h index 49e0ee2323..f7a8896db6 100644 --- a/src/runtime/asm_amd64.h +++ b/src/runtime/asm_amd64.h @@ -5,10 +5,21 @@ // Define features that are guaranteed to be supported by setting the AMD64 variable. // If a feature is supported, there's no need to check it at runtime every time. +#ifdef GOAMD64_v2 +#define hasPOPCNT +#define hasSSE42 +#endif + #ifdef GOAMD64_v3 +#define hasAVX #define hasAVX2 +#define hasPOPCNT +#define hasSSE42 #endif #ifdef GOAMD64_v4 +#define hasAVX #define hasAVX2 +#define hasPOPCNT +#define hasSSE42 #endif diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 28befcbd0d..61d2d0247e 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -126,6 +126,9 @@ func header(arch string) { fmt.Fprintf(out, "//go:build %s || %sle\n\n", base, base) } fmt.Fprintf(out, "#include \"go_asm.h\"\n") + if arch == "amd64" { + fmt.Fprintf(out, "#include \"asm_amd64.h\"\n") + } fmt.Fprintf(out, "#include \"textflag.h\"\n\n") fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n") } @@ -267,8 +270,10 @@ func genAMD64() { // Clear the upper bits to get to a clean state. See issue #37174. // It is safe here as Go code don't use the upper bits of Y registers. p("#ifdef GOOS_darwin") + p("#ifndef hasAVX") p("CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0") p("JE 2(PC)") + p("#endif") p("VZEROUPPER") p("#endif") diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s index 31f7c8b66f..94a84fb74c 100644 --- a/src/runtime/preempt_amd64.s +++ b/src/runtime/preempt_amd64.s @@ -1,6 +1,7 @@ // Code generated by mkpreempt.go; DO NOT EDIT. #include "go_asm.h" +#include "asm_amd64.h" #include "textflag.h" TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 @@ -27,8 +28,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVQ R14, 96(SP) MOVQ R15, 104(SP) #ifdef GOOS_darwin + #ifndef hasAVX CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0 JE 2(PC) + #endif VZEROUPPER #endif MOVUPS X0, 112(SP)