Cypherpunks repositories - gostls13.git/commitdiff
runtime: remove dead code and unnecessary checks for amd64
author vpachkov <slava.pach@gmail.com>
Fri, 1 Apr 2022 17:37:30 +0000 (20:37 +0300)
committer Keith Randall <khr@golang.org>
Thu, 18 Aug 2022 17:17:01 +0000 (17:17 +0000)
Use amd64 assembly header to remove unnecessary cpu flags checks
and dead code that is guaranteed to not be executed when compiling
for specific microarchitectures.

name                  old time/op  new time/op  delta
BytesCompare/1-12     3.88ns ± 1%  3.18ns ± 1%  -18.15%  (p=0.008 n=5+5)
BytesCompare/2-12     3.89ns ± 1%  3.21ns ± 2%  -17.66%  (p=0.008 n=5+5)
BytesCompare/4-12     3.89ns ± 0%  3.17ns ± 0%  -18.62%  (p=0.008 n=5+5)
BytesCompare/8-12     3.44ns ± 2%  3.39ns ± 1%   -1.36%  (p=0.008 n=5+5)
BytesCompare/16-12    3.40ns ± 1%  3.14ns ± 0%   -7.77%  (p=0.008 n=5+5)
BytesCompare/32-12    3.90ns ± 1%  3.65ns ± 0%   -6.19%  (p=0.008 n=5+5)
BytesCompare/64-12    4.96ns ± 1%  4.71ns ± 2%   -4.98%  (p=0.008 n=5+5)
BytesCompare/128-12   6.42ns ± 0%  5.99ns ± 4%   -6.75%  (p=0.008 n=5+5)
BytesCompare/256-12   9.36ns ± 0%  7.40ns ± 0%  -20.97%  (p=0.008 n=5+5)
BytesCompare/512-12   15.9ns ± 1%  11.4ns ± 1%  -28.36%  (p=0.008 n=5+5)
BytesCompare/1024-12  27.0ns ± 0%  19.3ns ± 0%  -28.36%  (p=0.008 n=5+5)
BytesCompare/2048-12  50.2ns ± 0%  43.3ns ± 0%  -13.71%  (p=0.008 n=5+5)
[Geo mean]            7.13ns       6.07ns       -14.86%

name                old speed      new speed      delta
Count/10-12          723MB/s ± 0%   704MB/s ± 1%  -2.73%  (p=0.008 n=5+5)
Count/32-12         2.21GB/s ± 0%  2.12GB/s ± 2%  -4.21%  (p=0.008 n=5+5)
Count/4K-12         1.03GB/s ± 0%  1.03GB/s ± 1%    ~     (p=1.000 n=5+5)
Count/4M-12         1.04GB/s ± 0%  1.02GB/s ± 2%    ~     (p=0.310 n=5+5)
Count/64M-12        1.02GB/s ± 0%  1.01GB/s ± 1%  -1.00%  (p=0.016 n=5+5)
CountEasy/10-12      779MB/s ± 0%   768MB/s ± 1%  -1.48%  (p=0.008 n=5+5)
CountEasy/32-12     2.15GB/s ± 0%  2.09GB/s ± 1%  -2.71%  (p=0.008 n=5+5)
CountEasy/4K-12     45.1GB/s ± 1%  45.2GB/s ± 1%    ~     (p=0.421 n=5+5)
CountEasy/4M-12     36.4GB/s ± 1%  36.5GB/s ± 1%    ~     (p=0.690 n=5+5)
CountEasy/64M-12    16.1GB/s ± 2%  16.4GB/s ± 1%    ~     (p=0.056 n=5+5)
CountSingle/10-12   2.15GB/s ± 2%  2.22GB/s ± 1%  +3.37%  (p=0.008 n=5+5)
CountSingle/32-12   5.86GB/s ± 1%  5.76GB/s ± 1%  -1.55%  (p=0.008 n=5+5)
CountSingle/4K-12   54.6GB/s ± 1%  55.0GB/s ± 1%    ~     (p=0.548 n=5+5)
CountSingle/4M-12   45.9GB/s ± 4%  46.4GB/s ± 2%    ~     (p=0.548 n=5+5)
CountSingle/64M-12  17.3GB/s ± 1%  17.2GB/s ± 2%    ~     (p=1.000 n=5+5)
[Geo mean]          5.11GB/s       5.08GB/s       -0.53%

name          old speed      new speed      delta
Equal/1-12     200MB/s ± 0%   188MB/s ± 1%   -6.11%  (p=0.008 n=5+5)
Equal/6-12    1.20GB/s ± 0%  1.13GB/s ± 1%   -6.38%  (p=0.008 n=5+5)
Equal/9-12    1.67GB/s ± 3%  1.74GB/s ± 1%   +3.83%  (p=0.008 n=5+5)
Equal/15-12   2.82GB/s ± 1%  2.89GB/s ± 1%   +2.63%  (p=0.008 n=5+5)
Equal/16-12   2.96GB/s ± 1%  3.08GB/s ± 1%   +3.95%  (p=0.008 n=5+5)
Equal/20-12   3.33GB/s ± 1%  3.54GB/s ± 1%   +6.36%  (p=0.008 n=5+5)
Equal/32-12   4.57GB/s ± 0%  5.26GB/s ± 1%  +15.09%  (p=0.008 n=5+5)
Equal/4K-12   62.0GB/s ± 1%  65.9GB/s ± 2%   +6.29%  (p=0.008 n=5+5)
Equal/4M-12   23.6GB/s ± 2%  24.8GB/s ± 4%   +5.43%  (p=0.008 n=5+5)
Equal/64M-12  11.1GB/s ± 2%  11.3GB/s ± 1%   +1.69%  (p=0.008 n=5+5)
[Geo mean]    3.91GB/s       4.03GB/s        +3.11%

name                              old speed      new speed      delta
IndexByte/10-12                   2.64GB/s ± 0%  2.69GB/s ± 0%   +1.67%  (p=0.008 n=5+5)
IndexByte/32-12                   6.79GB/s ± 0%  6.27GB/s ± 0%   -7.57%  (p=0.008 n=5+5)
IndexByte/4K-12                   56.2GB/s ± 0%  56.9GB/s ± 0%   +1.27%  (p=0.008 n=5+5)
IndexByte/4M-12                   40.1GB/s ± 1%  41.7GB/s ± 1%   +4.05%  (p=0.008 n=5+5)
IndexByte/64M-12                  17.5GB/s ± 0%  17.7GB/s ± 1%     ~     (p=0.095 n=5+5)
IndexBytePortable/10-12           2.06GB/s ± 1%  2.16GB/s ± 1%   +5.08%  (p=0.008 n=5+5)
IndexBytePortable/32-12           1.40GB/s ± 1%  1.54GB/s ± 1%  +10.05%  (p=0.008 n=5+5)
IndexBytePortable/4K-12           3.99GB/s ± 0%  4.08GB/s ± 0%   +2.16%  (p=0.008 n=5+5)
IndexBytePortable/4M-12           4.05GB/s ± 1%  4.08GB/s ± 2%     ~     (p=0.095 n=5+5)
IndexBytePortable/64M-12          3.80GB/s ± 1%  3.81GB/s ± 0%     ~     (p=0.421 n=5+5)
IndexRune/10-12                    746MB/s ± 1%   752MB/s ± 0%   +0.85%  (p=0.008 n=5+5)
IndexRune/32-12                   2.33GB/s ± 0%  2.42GB/s ± 0%   +3.66%  (p=0.008 n=5+5)
IndexRune/4K-12                   44.4GB/s ± 0%  44.2GB/s ± 0%     ~     (p=0.095 n=5+5)
IndexRune/4M-12                   36.2GB/s ± 1%  36.3GB/s ± 2%     ~     (p=0.841 n=5+5)
IndexRune/64M-12                  16.2GB/s ± 2%  16.3GB/s ± 2%     ~     (p=0.548 n=5+5)
IndexRuneASCII/10-12              2.57GB/s ± 0%  2.58GB/s ± 0%   +0.63%  (p=0.008 n=5+5)
IndexRuneASCII/32-12              6.00GB/s ± 0%  6.30GB/s ± 1%   +4.98%  (p=0.008 n=5+5)
IndexRuneASCII/4K-12              56.7GB/s ± 0%  56.8GB/s ± 1%     ~     (p=0.151 n=5+5)
IndexRuneASCII/4M-12              41.6GB/s ± 1%  41.7GB/s ± 2%     ~     (p=0.151 n=5+5)
IndexRuneASCII/64M-12             17.7GB/s ± 1%  17.6GB/s ± 1%     ~     (p=0.222 n=5+5)
Index/10-12                       1.06GB/s ± 1%  1.06GB/s ± 0%     ~     (p=0.310 n=5+5)
Index/32-12                       3.57GB/s ± 0%  3.56GB/s ± 1%     ~     (p=0.056 n=5+5)
Index/4K-12                       1.02GB/s ± 2%  1.03GB/s ± 0%     ~     (p=0.690 n=5+5)
Index/4M-12                       1.04GB/s ± 0%  1.03GB/s ± 1%     ~     (p=1.000 n=4+5)
Index/64M-12                      1.02GB/s ± 0%  1.02GB/s ± 0%     ~     (p=0.905 n=5+4)
IndexEasy/10-12                   1.12GB/s ± 2%  1.15GB/s ± 1%   +3.10%  (p=0.008 n=5+5)
IndexEasy/32-12                   3.14GB/s ± 2%  3.13GB/s ± 1%     ~     (p=0.310 n=5+5)
IndexEasy/4K-12                   47.6GB/s ± 1%  47.7GB/s ± 2%     ~     (p=0.310 n=5+5)
IndexEasy/4M-12                   36.4GB/s ± 1%  36.3GB/s ± 2%     ~     (p=0.690 n=5+5)
IndexEasy/64M-12                  16.1GB/s ± 1%  16.4GB/s ± 5%     ~     (p=0.151 n=5+5)
[Geo mean]                        6.39GB/s       6.46GB/s        +1.11%

Change-Id: Ic1ca62f5cc719d87e2c4aeff25ad73507facff82
Reviewed-on: https://go-review.googlesource.com/c/go/+/397576
Reviewed-by: Keith Randall <khr@google.com>
Run-TryBot: Keith Randall <khr@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
src/internal/bytealg/compare_amd64.s
src/internal/bytealg/count_amd64.s
src/internal/bytealg/equal_amd64.s
src/internal/bytealg/index_amd64.s
src/internal/bytealg/indexbyte_amd64.s
src/runtime/asm_amd64.h
src/runtime/mkpreempt.go
src/runtime/preempt_amd64.s

index 4ccaca5e87b23b397078d20ed8b413773469d2d5..fdd015f560b79ffb2fb8af6e5fa093a9cd3235e6 100644 (file)
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
@@ -44,9 +45,13 @@ TEXT cmpbody<>(SB),NOSPLIT,$0-0
 
        CMPQ    R8, $63
        JBE     loop
+#ifndef hasAVX2
        CMPB    internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
        JEQ     big_loop_avx2
        JMP     big_loop
+#else
+       JMP     big_loop_avx2
+#endif
 loop:
        CMPQ    R8, $16
        JBE     _0through16
@@ -155,6 +160,7 @@ allsame:
        RET
 
        // this works for >= 64 bytes of data.
+#ifndef hasAVX2
 big_loop:
        MOVOU   (SI), X0
        MOVOU   (DI), X1
@@ -190,6 +196,7 @@ big_loop:
        CMPQ    R8, $64
        JBE     loop
        JMP     big_loop
+#endif
 
        // Compare 64-bytes per loop iteration.
        // Loop is unrolled and uses AVX2.
index fa864c4c76631d8d83b89f92c913ffd55d1430ee..efb17f84b776a083066f52777eb55a7adb132c4e 100644 (file)
@@ -3,12 +3,15 @@
 // license that can be found in the LICENSE file.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 TEXT ·Count(SB),NOSPLIT,$0-40
+#ifndef hasPOPCNT
        CMPB    internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
        JEQ     2(PC)
        JMP     ·countGeneric(SB)
+#endif
        MOVQ    b_base+0(FP), SI
        MOVQ    b_len+8(FP), BX
        MOVB    c+24(FP), AL
@@ -16,9 +19,11 @@ TEXT ·Count(SB),NOSPLIT,$0-40
        JMP     countbody<>(SB)
 
 TEXT ·CountString(SB),NOSPLIT,$0-32
+#ifndef hasPOPCNT
        CMPB    internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
        JEQ     2(PC)
        JMP     ·countGenericString(SB)
+#endif
        MOVQ    s_base+0(FP), SI
        MOVQ    s_len+8(FP), BX
        MOVB    c+16(FP), AL
@@ -151,8 +156,10 @@ endofpage:
        RET
 
 avx2:
+#ifndef hasAVX2
        CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
        JNE sse
+#endif
        MOVD AX, X0
        LEAQ -32(SI)(BX*1), R11
        VPBROADCASTB  X0, Y1
index dd46e2e0fdfb0ea32485f4660ecfa66197b764db..d178a3377938d19e228cbead7d0d600d0458c339 100644 (file)
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 // memequal(a, b unsafe.Pointer, size uintptr) bool
@@ -46,6 +47,7 @@ TEXT memeqbody<>(SB),NOSPLIT,$0-0
        JB      small
        CMPQ    BX, $64
        JB      bigloop
+#ifndef hasAVX2
        CMPB    internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
        JE      hugeloop_avx2
 
@@ -76,6 +78,7 @@ hugeloop:
        JEQ     hugeloop
        XORQ    AX, AX  // return 0
        RET
+#endif
 
        // 64 bytes at a time using ymm registers
 hugeloop_avx2:
index 6193b572393a5829221682784fba697728cbdbc7..04314917b8918441df9baf13a0e7da0af57c4f1d 100644 (file)
@@ -233,8 +233,10 @@ success_avx2:
        VZEROUPPER
        JMP success
 sse42:
+#ifndef hasSSE42
        CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
        JNE no_sse42
+#endif
        CMPQ AX, $12
        // PCMPESTRI is slower than normal compare,
        // so using it makes sense only if we advance 4+ bytes per compare
index f78093c539013e15af1813daf2256f3f19bc7d6a..1ca70e39e23a79240b08c5cfeb46500f7e98cf74 100644 (file)
@@ -115,8 +115,10 @@ endofpage:
        RET
 
 avx2:
+#ifndef hasAVX2
        CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
        JNE sse
+#endif
        MOVD AX, X0
        LEAQ -32(SI)(BX*1), R11
        VPBROADCASTB  X0, Y1
index 49e0ee2323f94c0d6c64333815d70a45d91ea0e6..f7a8896db63348abe034a49b0cb02353dddc421a 100644 (file)
@@ -5,10 +5,21 @@
 // Define features that are guaranteed to be supported by setting the AMD64 variable.
 // If a feature is supported, there's no need to check it at runtime every time.
 
+#ifdef GOAMD64_v2
+#define hasPOPCNT
+#define hasSSE42
+#endif
+
 #ifdef GOAMD64_v3
+#define hasAVX
 #define hasAVX2
+#define hasPOPCNT
+#define hasSSE42
 #endif
 
 #ifdef GOAMD64_v4
+#define hasAVX
 #define hasAVX2
+#define hasPOPCNT
+#define hasSSE42
 #endif
index 28befcbd0dd460957d08bbc3b0e2d9fd6407d536..61d2d0247e80383b992c053547dfa38e8a8d3395 100644 (file)
@@ -126,6 +126,9 @@ func header(arch string) {
                fmt.Fprintf(out, "//go:build %s || %sle\n\n", base, base)
        }
        fmt.Fprintf(out, "#include \"go_asm.h\"\n")
+       if arch == "amd64" {
+               fmt.Fprintf(out, "#include \"asm_amd64.h\"\n")
+       }
        fmt.Fprintf(out, "#include \"textflag.h\"\n\n")
        fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n")
 }
@@ -267,8 +270,10 @@ func genAMD64() {
        // Clear the upper bits to get to a clean state. See issue #37174.
        // It is safe here as Go code don't use the upper bits of Y registers.
        p("#ifdef GOOS_darwin")
+       p("#ifndef hasAVX")
        p("CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0")
        p("JE 2(PC)")
+       p("#endif")
        p("VZEROUPPER")
        p("#endif")
 
index 31f7c8b66f502b6cbeccc90b7c695214614eb6b2..94a84fb74cafa752a4991360600fb3022ce9bf84 100644 (file)
@@ -1,6 +1,7 @@
 // Code generated by mkpreempt.go; DO NOT EDIT.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
@@ -27,8 +28,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
        MOVQ R14, 96(SP)
        MOVQ R15, 104(SP)
        #ifdef GOOS_darwin
+       #ifndef hasAVX
        CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0
        JE 2(PC)
+       #endif
        VZEROUPPER
        #endif
        MOVUPS X0, 112(SP)