Cypherpunks repositories - gostls13.git/commitdiff
reflect: optimize CALLFN wrapper for arm64
author Wei Xiao <Wei.Xiao@arm.com>
Fri, 7 Jul 2017 02:27:57 +0000 (10:27 +0800)
committer Cherry Zhang <cherryyz@google.com>
Tue, 17 Oct 2017 12:55:17 +0000 (12:55 +0000)
Optimize the arm64 CALLFN wrapper with LDP/STP instructions.
This provides a significant speedup when copying large arguments.
Benchmark results for reflect:

name                      old time/op    new time/op     delta
Call-8                      79.0ns ± 4%     73.6ns ± 4%    -6.78%  (p=0.000 n=10+10)
CallArgCopy/size=128-8      80.5ns ± 0%     60.3ns ± 0%   -25.06%  (p=0.000 n=10+9)
CallArgCopy/size=256-8       119ns ± 2%       67ns ± 1%   -43.59%  (p=0.000 n=8+10)
CallArgCopy/size=1024-8      524ns ± 1%       99ns ± 1%   -81.03%  (p=0.000 n=10+10)
CallArgCopy/size=4096-8      837ns ± 0%      231ns ± 1%   -72.42%  (p=0.000 n=9+9)
CallArgCopy/size=65536-8    13.6µs ± 6%      3.1µs ± 1%   -77.38%  (p=0.000 n=10+10)
PtrTo-8                     12.9ns ± 0%     13.1ns ± 3%    +1.86%  (p=0.000 n=10+10)
FieldByName1-8              28.7ns ± 2%     28.6ns ± 2%      ~     (p=0.408 n=9+10)
FieldByName2-8               928ns ± 4%      946ns ± 8%      ~     (p=0.326 n=9+10)
FieldByName3-8              5.35µs ± 5%     5.32µs ± 5%      ~     (p=0.755 n=10+10)
InterfaceBig-8              2.57ns ± 0%     2.57ns ± 0%      ~     (all equal)
InterfaceSmall-8            2.57ns ± 0%     2.57ns ± 0%      ~     (all equal)
New-8                       9.09ns ± 1%     8.83ns ± 1%    -2.81%  (p=0.000 n=10+9)

name                      old alloc/op   new alloc/op    delta
Call-8                       0.00B           0.00B           ~     (all equal)

name                      old allocs/op  new allocs/op   delta
Call-8                        0.00            0.00           ~     (all equal)

name                      old speed      new speed       delta
CallArgCopy/size=128-8    1.59GB/s ± 0%   2.12GB/s ± 1%   +33.46%  (p=0.000 n=10+9)
CallArgCopy/size=256-8    2.14GB/s ± 2%   3.81GB/s ± 1%   +78.02%  (p=0.000 n=8+10)
CallArgCopy/size=1024-8   1.95GB/s ± 1%  10.30GB/s ± 0%  +427.99%  (p=0.000 n=10+9)
CallArgCopy/size=4096-8   4.89GB/s ± 0%  17.69GB/s ± 1%  +261.87%  (p=0.000 n=9+9)
CallArgCopy/size=65536-8  4.84GB/s ± 6%  21.36GB/s ± 1%  +341.67%  (p=0.000 n=10+10)

Change-Id: I775d88b30c43cb2eda1d0612ac15e6d283e70beb
Reviewed-on: https://go-review.googlesource.com/70570
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>

src/runtime/asm_arm64.s

index 64311be4795de84c3f164b123b20cb6165edcfbd..8f2e03c7ef3c5f14bd667e4fb293f70a47f93949 100644 (file)
@@ -368,16 +368,26 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-24;              \
        NO_LOCAL_POINTERS;                      \
        /* copy arguments to stack */           \
        MOVD    arg+16(FP), R3;                 \
-       MOVWU   argsize+24(FP), R4;                     \
-       MOVD    RSP, R5;                                \
-       ADD     $(8-1), R5;                     \
-       SUB     $1, R3;                         \
-       ADD     R5, R4;                         \
-       CMP     R5, R4;                         \
-       BEQ     4(PC);                          \
-       MOVBU.W 1(R3), R6;                      \
-       MOVBU.W R6, 1(R5);                      \
-       B       -4(PC);                         \
+       MOVWU   argsize+24(FP), R4;             \
+       ADD     $8, RSP, R5;                    \
+       BIC     $0xf, R4, R6;                   \
+       CBZ     R6, 6(PC);                      \
+       /* if R6=(argsize&~15) != 0 */          \
+       ADD     R6, R5, R6;                     \
+       /* copy 16 bytes a time */              \
+       LDP.P   16(R3), (R7, R8);               \
+       STP.P   (R7, R8), 16(R5);               \
+       CMP     R5, R6;                         \
+       BNE     -3(PC);                         \
+       AND     $0xf, R4, R6;                   \
+       CBZ     R6, 6(PC);                      \
+       /* if R6=(argsize&15) != 0 */           \
+       ADD     R6, R5, R6;                     \
+       /* copy 1 byte a time for the rest */   \
+       MOVBU.P 1(R3), R7;                      \
+       MOVBU.P R7, 1(R5);                      \
+       CMP     R5, R6;                         \
+       BNE     -3(PC);                         \
        /* call function */                     \
        MOVD    f+8(FP), R26;                   \
        MOVD    (R26), R0;                      \