From a712e20a1d1587e115c1295df0850120cdfa3e44 Mon Sep 17 00:00:00 2001
From: Rui Ueyama
Date: Mon, 23 Jun 2014 12:06:26 -0700
Subject: [PATCH] runtime: speed up amd64 memmove

MOV with SSE registers seems faster than REP MOVSQ if the size being
copied is less than about 2K. Previously we didn't use MOV if the
memory region was larger than 256 bytes. This patch improves the
performance of non-overlapping copies of 257 to 2048 bytes by using
MOV.

Here is the benchmark result on an Intel Xeon 3.5GHz (Nehalem).

benchmark                old ns/op    new ns/op    delta
BenchmarkMemmove16               4            4    +0.42%
BenchmarkMemmove32               5            5    -0.20%
BenchmarkMemmove64               6            6    -0.81%
BenchmarkMemmove128              7            7    -0.82%
BenchmarkMemmove256             10           10    +1.92%
BenchmarkMemmove512             29           16   -44.90%
BenchmarkMemmove1024            37           25   -31.55%
BenchmarkMemmove2048            55           44   -19.46%
BenchmarkMemmove4096            92           91    -0.76%

benchmark                 old MB/s     new MB/s  speedup
BenchmarkMemmove16         3370.61      3356.88    1.00x
BenchmarkMemmove32         6368.68      6386.99    1.00x
BenchmarkMemmove64        10367.37     10462.62    1.01x
BenchmarkMemmove128       17551.16     17713.48    1.01x
BenchmarkMemmove256       24692.81     24142.99    0.98x
BenchmarkMemmove512       17428.70     31687.72    1.82x
BenchmarkMemmove1024      27401.82     40009.45    1.46x
BenchmarkMemmove2048      36884.86     45766.98    1.24x
BenchmarkMemmove4096      44295.91     44627.86    1.01x

LGTM=khr
R=golang-codereviews, gobot, khr
CC=golang-codereviews
https://golang.org/cl/90500043
---
 src/pkg/runtime/memmove_amd64.s | 52 ++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/pkg/runtime/memmove_amd64.s b/src/pkg/runtime/memmove_amd64.s
index 5895846db6..7e384bd58d 100644
--- a/src/pkg/runtime/memmove_amd64.s
+++ b/src/pkg/runtime/memmove_amd64.s
@@ -36,10 +36,13 @@ TEXT runtime·memmove(SB), NOSPLIT, $0-24
 
 	// REP instructions have a high startup cost, so we handle small sizes
 	// with some straightline code. The REP MOVSQ instruction is really fast
-	// for large sizes. The cutover is approximately 1K. We implement up to
-	// 256 because that is the maximum SSE register load (loading all data
-	// into registers lets us ignore copy direction).
+	// for large sizes. The cutover is approximately 2K.
 tail:
+	// move_129through256 or smaller work whether or not the source and the
+	// destination memory regions overlap because they load all data into
+	// registers before writing it back. move_256through2048 on the other
+	// hand can be used only when the memory regions don't overlap or the copy
+	// direction is forward.
 	TESTQ	BX, BX
 	JEQ	move_0
 	CMPQ	BX, $2
@@ -70,10 +73,12 @@ tail:
 /*
  * forward copy loop
  */
 forward:
+	CMPQ	BX, $2048
+	JLS	move_256through2048
+
 	MOVQ	BX, CX
 	SHRQ	$3, CX
 	ANDQ	$7, BX
-
 	REP;	MOVSQ
 	JMP	tail
@@ -205,3 +210,42 @@ move_129through256:
 	MOVOU	X14, -32(DI)(BX*1)
 	MOVOU	X15, -16(DI)(BX*1)
 	RET
+move_256through2048:
+	SUBQ	$256, BX
+	MOVOU	(SI), X0
+	MOVOU	16(SI), X1
+	MOVOU	32(SI), X2
+	MOVOU	48(SI), X3
+	MOVOU	64(SI), X4
+	MOVOU	80(SI), X5
+	MOVOU	96(SI), X6
+	MOVOU	112(SI), X7
+	MOVOU	128(SI), X8
+	MOVOU	144(SI), X9
+	MOVOU	160(SI), X10
+	MOVOU	176(SI), X11
+	MOVOU	192(SI), X12
+	MOVOU	208(SI), X13
+	MOVOU	224(SI), X14
+	MOVOU	240(SI), X15
+	MOVOU	X0, (DI)
+	MOVOU	X1, 16(DI)
+	MOVOU	X2, 32(DI)
+	MOVOU	X3, 48(DI)
+	MOVOU	X4, 64(DI)
+	MOVOU	X5, 80(DI)
+	MOVOU	X6, 96(DI)
+	MOVOU	X7, 112(DI)
+	MOVOU	X8, 128(DI)
+	MOVOU	X9, 144(DI)
+	MOVOU	X10, 160(DI)
+	MOVOU	X11, 176(DI)
+	MOVOU	X12, 192(DI)
+	MOVOU	X13, 208(DI)
+	MOVOU	X14, 224(DI)
+	MOVOU	X15, 240(DI)
+	CMPQ	BX, $256
+	LEAQ	256(SI), SI
+	LEAQ	256(DI), DI
+	JGE	move_256through2048
+	JMP	tail
-- 
2.48.1
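
For reference, the ns/op and MB/s tables above come from the standard Go
benchmark harness. Below is a minimal sketch that reproduces the same kind
of measurement; it uses the modern sub-benchmark API (testing.B.Run) rather
than the original src/pkg/runtime test file, and the file and package names
are arbitrary. copy between two separately allocated byte slices compiles
down to the runtime memmove that this patch changes.

// memmove_bench_test.go (hypothetical file name; any lone *_test.go works).
// Run with: go test -bench=Memmove
package bench

import (
	"fmt"
	"testing"
)

func BenchmarkMemmove(b *testing.B) {
	// Sizes chosen to bracket the 256-byte and 2K cutovers discussed above.
	for _, n := range []int{16, 32, 64, 128, 256, 512, 1024, 2048, 4096} {
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			src := make([]byte, n)
			dst := make([]byte, n) // separate allocation: non-overlapping copy
			b.SetBytes(int64(n))   // makes the framework report MB/s
			for i := 0; i < b.N; i++ {
				copy(dst, src) // copy on []byte lowers to the runtime memmove
			}
		})
	}
}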