MOV with SSE registers seems faster than REP MOVSQ when the
size being copied is less than about 2K. Previously we did
not use MOV when the memory region was larger than 256
bytes. This patch improves the performance of non-overlapping
copies of 257 to 2048 bytes by using MOV.

Here are the benchmark results on an Intel Xeon 3.5GHz (Nehalem):
benchmark                old ns/op    new ns/op    delta
BenchmarkMemmove16       4            4            +0.42%
BenchmarkMemmove32       5            5            -0.20%
BenchmarkMemmove64       6            6            -0.81%
BenchmarkMemmove128      7            7            -0.82%
BenchmarkMemmove256      10           10           +1.92%
BenchmarkMemmove512      29           16           -44.90%
BenchmarkMemmove1024     37           25           -31.55%
BenchmarkMemmove2048     55           44           -19.46%
BenchmarkMemmove4096     92           91           -0.76%

benchmark                old MB/s     new MB/s     speedup
BenchmarkMemmove16       3370.61      3356.88      1.00x
BenchmarkMemmove32       6368.68      6386.99      1.00x
BenchmarkMemmove64       10367.37     10462.62     1.01x
BenchmarkMemmove128      17551.16     17713.48     1.01x
BenchmarkMemmove256      24692.81     24142.99     0.98x
BenchmarkMemmove512      17428.70     31687.72     1.82x
BenchmarkMemmove1024     27401.82     40009.45     1.46x
BenchmarkMemmove2048     36884.86     45766.98     1.24x
BenchmarkMemmove4096     44295.91     44627.86     1.01x
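For reference, the numbers above come from memmove benchmarks of
this shape; a minimal Go sketch is below (the helper name bmMemmove
is illustrative, and copy on byte slices is used as a stand-in that
compiles down to the runtime memmove):

	package runtime_test

	import "testing"

	// bmMemmove times fixed-size copies; b.SetBytes makes the
	// testing package report the MB/s throughput shown above.
	func bmMemmove(b *testing.B, n int) {
		dst := make([]byte, n)
		src := make([]byte, n)
		b.SetBytes(int64(n))
		for i := 0; i < b.N; i++ {
			copy(dst, src)
		}
	}

	func BenchmarkMemmove512(b *testing.B)  { bmMemmove(b, 512) }
	func BenchmarkMemmove1024(b *testing.B) { bmMemmove(b, 1024) }
	func BenchmarkMemmove2048(b *testing.B) { bmMemmove(b, 2048) }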
LGTM=khr
R=golang-codereviews, gobot, khr
CC=golang-codereviews
https://golang.org/cl/90500043
// REP instructions have a high startup cost, so we handle small sizes
// with some straightline code. The REP MOVSQ instruction is really fast
- // for large sizes. The cutover is approximately 1K. We implement up to
- // 256 because that is the maximum SSE register load (loading all data
- // into registers lets us ignore copy direction).
+ // for large sizes. The cutover is approximately 2K.
tail:
+ // move_129through256 or smaller work whether or not the source and the
+ // destination memory regions overlap because they load all data into
+ // registers before writing it back. move_256through2048 on the other
+ // hand can be used only when the memory regions don't overlap or the copy
+ // direction is forward.
TESTQ BX, BX
JEQ move_0
CMPQ BX, $2
/*
 * forward copy loop
 */
forward:
+ CMPQ BX, $2048
+ JLS move_256through2048
+
MOVQ BX, CX
SHRQ $3, CX
ANDQ $7, BX
-
REP; MOVSQ
JMP tail
MOVOU X14, -32(DI)(BX*1)
MOVOU X15, -16(DI)(BX*1)
RET
+move_256through2048:
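+	// Each iteration copies one 256-byte block: all sixteen SSE
+	// registers are loaded from the source before any of them is
+	// stored, then SI and DI advance by 256 bytes at the bottom.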
+ SUBQ $256, BX
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU 32(SI), X2
+ MOVOU 48(SI), X3
+ MOVOU 64(SI), X4
+ MOVOU 80(SI), X5
+ MOVOU 96(SI), X6
+ MOVOU 112(SI), X7
+ MOVOU 128(SI), X8
+ MOVOU 144(SI), X9
+ MOVOU 160(SI), X10
+ MOVOU 176(SI), X11
+ MOVOU 192(SI), X12
+ MOVOU 208(SI), X13
+ MOVOU 224(SI), X14
+ MOVOU 240(SI), X15
+ MOVOU X0, (DI)
+ MOVOU X1, 16(DI)
+ MOVOU X2, 32(DI)
+ MOVOU X3, 48(DI)
+ MOVOU X4, 64(DI)
+ MOVOU X5, 80(DI)
+ MOVOU X6, 96(DI)
+ MOVOU X7, 112(DI)
+ MOVOU X8, 128(DI)
+ MOVOU X9, 144(DI)
+ MOVOU X10, 160(DI)
+ MOVOU X11, 176(DI)
+ MOVOU X12, 192(DI)
+ MOVOU X13, 208(DI)
+ MOVOU X14, 224(DI)
+ MOVOU X15, 240(DI)
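+	// CMPQ sets the flags here; LEAQ does not modify them, so the
+	// JGE below still tests BX against 256 after both pointers
+	// have been advanced.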
+ CMPQ BX, $256
+ LEAQ 256(SI), SI
+ LEAQ 256(DI), DI
+ JGE move_256through2048
+ JMP tail
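To illustrate the overlap note in the tail comment: a copy that reads
its whole block before writing anything is safe on overlapping
regions, while a streaming forward copy is not when the destination
starts inside the source. A minimal Go sketch of the two behaviors
(illustrative only; the function names are assumptions, not runtime
code):

	package main

	import "fmt"

	// blockCopy buffers the entire source block before writing,
	// mirroring how move_129through256 holds all data in SSE
	// registers: overlap between dst and src cannot corrupt it.
	func blockCopy(buf []byte, dst, src, n int) {
		tmp := make([]byte, n)
		copy(tmp, buf[src:src+n])
		copy(buf[dst:dst+n], tmp)
	}

	// streamCopy writes data as soon as it is read, a
	// byte-granularity analogue of the forward move_256through2048
	// loop (which streams 256-byte blocks): safe only when the
	// regions do not overlap or the copy direction is forward.
	func streamCopy(buf []byte, dst, src, n int) {
		for i := 0; i < n; i++ {
			buf[dst+i] = buf[src+i]
		}
	}

	func main() {
		a := []byte("abcdefgh")
		b := []byte("abcdefgh")
		blockCopy(a, 2, 0, 4)  // overlapping, dst above src: correct
		streamCopy(b, 2, 0, 4) // overlapping, dst above src: clobbered
		fmt.Println(string(a)) // ababcdgh
		fmt.Println(string(b)) // abababgh
	}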