The previous CALLFN macro was copying a single byte at a
time which is extremely inefficient on ppc64x. This changes
the macro so it copies 8 bytes at a time.
benchmark in reflect:
name old time/op new time/op delta
Call-8 177ns ± 0% 165ns ± 0% -6.78% (p=1.000 n=1+1)
CallArgCopy/size=128-8 194ns ± 0% 140ns ± 0% -27.84% (p=1.000 n=1+1)
CallArgCopy/size=256-8 253ns ± 0% 159ns ± 0% -37.15% (p=1.000 n=1+1)
CallArgCopy/size=1024-8 612ns ± 0% 222ns ± 0% -63.73% (p=1.000 n=1+1)
CallArgCopy/size=4096-8 2.14µs ± 0% 0.53µs ± 0% -75.01% (p=1.000 n=1+1)
CallArgCopy/size=65536-8 33.0µs ± 0% 7.3µs ± 0% -77.72% (p=1.000 n=1+1)
Change-Id: I71f6ee788264e61bb072264d21b77b83592c9dca
Reviewed-on: https://go-review.googlesource.com/134635
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Reviewed-by: Michael Munday <mike.munday@ibm.com>
/* copy arguments to stack */ \
MOVD arg+16(FP), R3; \
MOVWZ argsize+24(FP), R4; \
- MOVD R1, R5; \
- ADD $(FIXED_FRAME-1), R5; \
- SUB $1, R3; \
- ADD R5, R4; \
- CMP R5, R4; \
- BEQ 4(PC); \
- MOVBZU 1(R3), R6; \
- MOVBZU R6, 1(R5); \
- BR -4(PC); \
+ MOVD R1, R5; \
+ CMP R4, $8; \
+ BLT tailsetup; \
+ /* copy 8 at a time if possible */ \
+ ADD $(FIXED_FRAME-8), R5; \
+ SUB $8, R3; \
+top: \
+ MOVDU 8(R3), R7; \
+ MOVDU R7, 8(R5); \
+ SUB $8, R4; \
+ CMP R4, $8; \
+ BGE top; \
+ /* handle remaining bytes */ \
+ CMP $0, R4; \
+ BEQ callfn; \
+ ADD $7, R3; \
+ ADD $7, R5; \
+ BR tail; \
+tailsetup: \
+ CMP $0, R4; \
+ BEQ callfn; \
+ ADD $(FIXED_FRAME-1), R5; \
+ SUB $1, R3; \
+tail: \
+ MOVBU 1(R3), R6; \
+ MOVBU R6, 1(R5); \
+ SUB $1, R4; \
+ CMP $0, R4; \
+ BGT tail; \
+callfn: \
/* call function */ \
MOVD f+8(FP), R11; \
MOVD (R11), R12; \