From: Keith Randall
Date: Fri, 17 May 2013 19:53:49 +0000 (-0700)
Subject: runtime: faster x86 memmove (a.k.a. built-in copy())
X-Git-Tag: go1.2rc2~1485
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=6021449236c8ef46a6c78518470d0355b56943f3;p=gostls13.git

runtime: faster x86 memmove (a.k.a. built-in copy())

REP instructions have a high startup cost, so we handle small sizes
with some straightline code.  The REP MOVSx instructions are really
fast for large sizes.  The cutover is approximately 1K.  We implement
up to 128/256 because that is the maximum SSE register load (loading
all data into registers before any stores lets us ignore copy direction).

(on a Sandy Bridge E5-1650 @ 3.20GHz)
benchmark                old ns/op    new ns/op    delta
BenchmarkMemmove0                3            3    +0.86%
BenchmarkMemmove1                5            5    +5.40%
BenchmarkMemmove2               18            8   -56.84%
BenchmarkMemmove3               18            7   -58.45%
BenchmarkMemmove4               36            7   -78.63%
BenchmarkMemmove5               36            8   -77.91%
BenchmarkMemmove6               36            8   -77.76%
BenchmarkMemmove7               36            8   -77.82%
BenchmarkMemmove8               18            8   -56.33%
BenchmarkMemmove9               18            7   -58.34%
BenchmarkMemmove10              18            7   -58.34%
BenchmarkMemmove11              18            7   -58.45%
BenchmarkMemmove12              36            7   -78.51%
BenchmarkMemmove13              36            7   -78.48%
BenchmarkMemmove14              36            7   -78.56%
BenchmarkMemmove15              36            7   -78.56%
BenchmarkMemmove16              18            7   -58.24%
BenchmarkMemmove32              18            8   -54.33%
BenchmarkMemmove64              18            8   -53.37%
BenchmarkMemmove128             20            9   -55.93%
BenchmarkMemmove256             25           11   -55.16%
BenchmarkMemmove512             33           33    -1.19%
BenchmarkMemmove1024            43           44    +2.06%
BenchmarkMemmove2048            61           61    +0.16%
BenchmarkMemmove4096            95           95    +0.00%

R=golang-dev, bradfitz, remyoudompheng, khr, iant, dominik.honnef
CC=golang-dev
https://golang.org/cl/9038048
---

diff --git a/src/pkg/runtime/memmove_386.s b/src/pkg/runtime/memmove_386.s
index 203a8187c0..37c66b098b 100644
--- a/src/pkg/runtime/memmove_386.s
+++ b/src/pkg/runtime/memmove_386.s
@@ -27,6 +27,34 @@ TEXT runtime·memmove(SB), 7, $0
 	MOVL	to+0(FP), DI
 	MOVL	fr+4(FP), SI
 	MOVL	n+8(FP), BX
+
+	// REP instructions have a high startup cost, so we handle small sizes
+	// with some straightline code. The REP MOVSL instruction is really fast
+	// for large sizes. The cutover is approximately 1K. We implement up to
+	// 128 because that is the maximum SSE register load (loading all data
+	// into registers lets us ignore copy direction).
+tail:
+	TESTL	BX, BX
+	JEQ	move_0
+	CMPL	BX, $2
+	JBE	move_1or2
+	CMPL	BX, $4
+	JBE	move_3or4
+	CMPL	BX, $8
+	JBE	move_5through8
+	CMPL	BX, $16
+	JBE	move_9through16
+	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
+	JEQ	nosse2
+	CMPL	BX, $32
+	JBE	move_17through32
+	CMPL	BX, $64
+	JBE	move_33through64
+	CMPL	BX, $128
+	JBE	move_65through128
+	// TODO: use branch table and BSR to make this just a single dispatch
+
+nosse2:
 /*
  * check and set for backwards
  */
@@ -42,11 +70,7 @@ forward:
 	ANDL	$3, BX
 	REP;	MOVSL
 
-	MOVL	BX, CX
-	REP;	MOVSB
-
-	MOVL	to+0(FP),AX
-	RET
+	JMP	tail
 /*
  * check overlap
  */
@@ -75,12 +99,73 @@ back:
 	SUBL	$4, SI
 	REP;	MOVSL
 
-	ADDL	$3, DI
-	ADDL	$3, SI
-	MOVL	BX, CX
-	REP;	MOVSB
-	CLD
-	MOVL	to+0(FP),AX
-	RET
+	ADDL	$4, DI
+	ADDL	$4, SI
+	SUBL	BX, DI
+	SUBL	BX, SI
+	JMP	tail
+
+move_1or2:
+	MOVB	(SI), AX
+	MOVB	-1(SI)(BX*1), CX
+	MOVB	AX, (DI)
+	MOVB	CX, -1(DI)(BX*1)
+move_0:
+	RET
+move_3or4:
+	MOVW	(SI), AX
+	MOVW	-2(SI)(BX*1), CX
+	MOVW	AX, (DI)
+	MOVW	CX, -2(DI)(BX*1)
+	RET
+move_5through8:
+	MOVL	(SI), AX
+	MOVL	-4(SI)(BX*1), CX
+	MOVL	AX, (DI)
+	MOVL	CX, -4(DI)(BX*1)
+	RET
+move_9through16:
+	MOVL	(SI), AX
+	MOVL	4(SI), CX
+	MOVL	-8(SI)(BX*1), DX
+	MOVL	-4(SI)(BX*1), BP
+	MOVL	AX, (DI)
+	MOVL	CX, 4(DI)
+	MOVL	DX, -8(DI)(BX*1)
+	MOVL	BP, -4(DI)(BX*1)
+	RET
+move_17through32:
+	MOVOU	(SI), X0
+	MOVOU	-16(SI)(BX*1), X1
+	MOVOU	X0, (DI)
+	MOVOU	X1, -16(DI)(BX*1)
+	RET
+move_33through64:
+	MOVOU	(SI), X0
+	MOVOU	16(SI), X1
+	MOVOU	-32(SI)(BX*1), X2
+	MOVOU	-16(SI)(BX*1), X3
+	MOVOU	X0, (DI)
+	MOVOU	X1, 16(DI)
+	MOVOU	X2, -32(DI)(BX*1)
+	MOVOU	X3, -16(DI)(BX*1)
+	RET
+move_65through128:
+	MOVOU	(SI), X0
+	MOVOU	16(SI), X1
+	MOVOU	32(SI), X2
+	MOVOU	48(SI), X3
+	MOVOU	-64(SI)(BX*1), X4
+	MOVOU	-48(SI)(BX*1), X5
+	MOVOU	-32(SI)(BX*1), X6
+	MOVOU	-16(SI)(BX*1), X7
+	MOVOU	X0, (DI)
+	MOVOU	X1, 16(DI)
+	MOVOU	X2, 32(DI)
+	MOVOU	X3, 48(DI)
+	MOVOU	X4, -64(DI)(BX*1)
+	MOVOU	X5, -48(DI)(BX*1)
+	MOVOU	X6, -32(DI)(BX*1)
+	MOVOU	X7, -16(DI)(BX*1)
+	RET
diff --git a/src/pkg/runtime/memmove_amd64.s b/src/pkg/runtime/memmove_amd64.s
index 6174407e33..837faa182b 100644
--- a/src/pkg/runtime/memmove_amd64.s
+++ b/src/pkg/runtime/memmove_amd64.s
@@ -30,6 +30,32 @@ TEXT runtime·memmove(SB), 7, $0
 	MOVQ	fr+8(FP), SI
 	MOVQ	n+16(FP), BX
 
+	// REP instructions have a high startup cost, so we handle small sizes
+	// with some straightline code. The REP MOVSQ instruction is really fast
+	// for large sizes. The cutover is approximately 1K. We implement up to
+	// 256 because that is the maximum SSE register load (loading all data
+	// into registers lets us ignore copy direction).
+tail:
+	TESTQ	BX, BX
+	JEQ	move_0
+	CMPQ	BX, $2
+	JBE	move_1or2
+	CMPQ	BX, $4
+	JBE	move_3or4
+	CMPQ	BX, $8
+	JBE	move_5through8
+	CMPQ	BX, $16
+	JBE	move_9through16
+	CMPQ	BX, $32
+	JBE	move_17through32
+	CMPQ	BX, $64
+	JBE	move_33through64
+	CMPQ	BX, $128
+	JBE	move_65through128
+	CMPQ	BX, $256
+	JBE	move_129through256
+	// TODO: use branch table and BSR to make this just a single dispatch
+
 /*
  * check and set for backwards
  */
@@ -45,11 +71,8 @@ forward:
 	ANDQ	$7, BX
 	REP;	MOVSQ
 
-	MOVQ	BX, CX
-	REP;	MOVSB
+	JMP	tail
 
-	MOVQ	to+0(FP),AX
-	RET
 back:
 /*
  * check overlap
@@ -78,12 +101,103 @@ back:
 	SUBQ	$8, SI
 	REP;	MOVSQ
 
-	ADDQ	$7, DI
-	ADDQ	$7, SI
-	MOVQ	BX, CX
-	REP;	MOVSB
-	CLD
-	MOVQ	to+0(FP),AX
-	RET
+	ADDQ	$8, DI
+	ADDQ	$8, SI
+	SUBQ	BX, DI
+	SUBQ	BX, SI
+	JMP	tail
+
+move_1or2:
+	MOVB	(SI), AX
+	MOVB	-1(SI)(BX*1), CX
+	MOVB	AX, (DI)
+	MOVB	CX, -1(DI)(BX*1)
+move_0:
+	RET
+move_3or4:
+	MOVW	(SI), AX
+	MOVW	-2(SI)(BX*1), CX
+	MOVW	AX, (DI)
+	MOVW	CX, -2(DI)(BX*1)
+	RET
+move_5through8:
+	MOVL	(SI), AX
+	MOVL	-4(SI)(BX*1), CX
+	MOVL	AX, (DI)
+	MOVL	CX, -4(DI)(BX*1)
+	RET
+move_9through16:
+	MOVQ	(SI), AX
+	MOVQ	-8(SI)(BX*1), CX
+	MOVQ	AX, (DI)
+	MOVQ	CX, -8(DI)(BX*1)
+	RET
+move_17through32:
+	MOVOU	(SI), X0
+	MOVOU	-16(SI)(BX*1), X1
+	MOVOU	X0, (DI)
+	MOVOU	X1, -16(DI)(BX*1)
+	RET
+move_33through64:
+	MOVOU	(SI), X0
+	MOVOU	16(SI), X1
+	MOVOU	-32(SI)(BX*1), X2
+	MOVOU	-16(SI)(BX*1), X3
+	MOVOU	X0, (DI)
+	MOVOU	X1, 16(DI)
+	MOVOU	X2, -32(DI)(BX*1)
+	MOVOU	X3, -16(DI)(BX*1)
+	RET
+move_65through128:
+	MOVOU	(SI), X0
+	MOVOU	16(SI), X1
+	MOVOU	32(SI), X2
+	MOVOU	48(SI), X3
+	MOVOU	-64(SI)(BX*1), X4
+	MOVOU	-48(SI)(BX*1), X5
+	MOVOU	-32(SI)(BX*1), X6
+	MOVOU	-16(SI)(BX*1), X7
+	MOVOU	X0, (DI)
+	MOVOU	X1, 16(DI)
+	MOVOU	X2, 32(DI)
+	MOVOU	X3, 48(DI)
+	MOVOU	X4, -64(DI)(BX*1)
+	MOVOU	X5, -48(DI)(BX*1)
+	MOVOU	X6, -32(DI)(BX*1)
+	MOVOU	X7, -16(DI)(BX*1)
+	RET
+move_129through256:
+	MOVOU	(SI), X0
+	MOVOU	16(SI), X1
+	MOVOU	32(SI), X2
+	MOVOU	48(SI), X3
+	MOVOU	64(SI), X4
+	MOVOU	80(SI), X5
+	MOVOU	96(SI), X6
+	MOVOU	112(SI), X7
+	MOVOU	-128(SI)(BX*1), X8
+	MOVOU	-112(SI)(BX*1), X9
+	MOVOU	-96(SI)(BX*1), X10
+	MOVOU	-80(SI)(BX*1), X11
+	MOVOU	-64(SI)(BX*1), X12
+	MOVOU	-48(SI)(BX*1), X13
+	MOVOU	-32(SI)(BX*1), X14
+	MOVOU	-16(SI)(BX*1), X15
+	MOVOU	X0, (DI)
+	MOVOU	X1, 16(DI)
+	MOVOU	X2, 32(DI)
+	MOVOU	X3, 48(DI)
+	MOVOU	X4, 64(DI)
+	MOVOU	X5, 80(DI)
+	MOVOU	X6, 96(DI)
+	MOVOU	X7, 112(DI)
+	MOVOU	X8, -128(DI)(BX*1)
+	MOVOU	X9, -112(DI)(BX*1)
+	MOVOU	X10, -96(DI)(BX*1)
+	MOVOU	X11, -80(DI)(BX*1)
+	MOVOU	X12, -64(DI)(BX*1)
+	MOVOU	X13, -48(DI)(BX*1)
+	MOVOU	X14, -32(DI)(BX*1)
+	MOVOU	X15, -16(DI)(BX*1)
+	RET
diff --git a/src/pkg/runtime/memmove_test.go b/src/pkg/runtime/memmove_test.go
new file mode 100644
index 0000000000..9525f06826
--- /dev/null
+++ b/src/pkg/runtime/memmove_test.go
@@ -0,0 +1,116 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"testing"
+)
+
+func TestMemmove(t *testing.T) {
+	size := 256
+	if testing.Short() {
+		size = 128 + 16
+	}
+	src := make([]byte, size)
+	dst := make([]byte, size)
+	for i := 0; i < size; i++ {
+		src[i] = byte(128 + (i & 127))
+	}
+	for i := 0; i < size; i++ {
+		dst[i] = byte(i & 127)
+	}
+	for n := 0; n <= size; n++ {
+		for x := 0; x <= size-n; x++ { // offset in src
+			for y := 0; y <= size-n; y++ { // offset in dst
+				copy(dst[y:y+n], src[x:x+n])
+				for i := 0; i < y; i++ {
+					if dst[i] != byte(i&127) {
+						t.Fatalf("prefix dst[%d] = %d", i, dst[i])
+					}
+				}
+				for i := y; i < y+n; i++ {
+					if dst[i] != byte(128+((i-y+x)&127)) {
+						t.Fatalf("copied dst[%d] = %d", i, dst[i])
+					}
+					dst[i] = byte(i & 127) // reset dst
+				}
+				for i := y + n; i < size; i++ {
+					if dst[i] != byte(i&127) {
+						t.Fatalf("suffix dst[%d] = %d", i, dst[i])
+					}
+				}
+			}
+		}
+	}
+}
+
+func TestMemmoveAlias(t *testing.T) {
+	size := 256
+	if testing.Short() {
+		size = 128 + 16
+	}
+	buf := make([]byte, size)
+	for i := 0; i < size; i++ {
+		buf[i] = byte(i)
+	}
+	for n := 0; n <= size; n++ {
+		for x := 0; x <= size-n; x++ { // src offset
+			for y := 0; y <= size-n; y++ { // dst offset
+				copy(buf[y:y+n], buf[x:x+n])
+				for i := 0; i < y; i++ {
+					if buf[i] != byte(i) {
+						t.Fatalf("prefix buf[%d] = %d", i, buf[i])
+					}
+				}
+				for i := y; i < y+n; i++ {
+					if buf[i] != byte(i-y+x) {
+						t.Fatalf("copied buf[%d] = %d", i, buf[i])
+					}
+					buf[i] = byte(i) // reset buf
+				}
+				for i := y + n; i < size; i++ {
+					if buf[i] != byte(i) {
+						t.Fatalf("suffix buf[%d] = %d", i, buf[i])
+					}
+				}
+			}
+		}
+	}
+}
+
+func bmMemmove(n int, b *testing.B) {
+	x := make([]byte, n)
+	y := make([]byte, n)
+	b.SetBytes(int64(n))
+	for i := 0; i < b.N; i++ {
+		copy(x, y)
+	}
+}
+
+func BenchmarkMemmove0(b *testing.B)    { bmMemmove(0, b) }
+func BenchmarkMemmove1(b *testing.B)    { bmMemmove(1, b) }
+func BenchmarkMemmove2(b *testing.B)    { bmMemmove(2, b) }
+func BenchmarkMemmove3(b *testing.B)    { bmMemmove(3, b) }
+func BenchmarkMemmove4(b *testing.B)    { bmMemmove(4, b) }
+func BenchmarkMemmove5(b *testing.B)    { bmMemmove(5, b) }
+func BenchmarkMemmove6(b *testing.B)    { bmMemmove(6, b) }
+func BenchmarkMemmove7(b *testing.B)    { bmMemmove(7, b) }
+func BenchmarkMemmove8(b *testing.B)    { bmMemmove(8, b) }
+func BenchmarkMemmove9(b *testing.B)    { bmMemmove(9, b) }
+func BenchmarkMemmove10(b *testing.B)   { bmMemmove(10, b) }
+func BenchmarkMemmove11(b *testing.B)   { bmMemmove(11, b) }
+func BenchmarkMemmove12(b *testing.B)   { bmMemmove(12, b) }
+func BenchmarkMemmove13(b *testing.B)   { bmMemmove(13, b) }
+func BenchmarkMemmove14(b *testing.B)   { bmMemmove(14, b) }
+func BenchmarkMemmove15(b *testing.B)   { bmMemmove(15, b) }
+func BenchmarkMemmove16(b *testing.B)   { bmMemmove(16, b) }
+func BenchmarkMemmove32(b *testing.B)   { bmMemmove(32, b) }
+func BenchmarkMemmove64(b *testing.B)   { bmMemmove(64, b) }
+func BenchmarkMemmove128(b *testing.B)  { bmMemmove(128, b) }
+func BenchmarkMemmove256(b *testing.B)  { bmMemmove(256, b) }
+func BenchmarkMemmove512(b *testing.B)  { bmMemmove(512, b) }
+func BenchmarkMemmove1024(b *testing.B) { bmMemmove(1024, b) }
+func BenchmarkMemmove2048(b *testing.B) { bmMemmove(2048, b) }
+func BenchmarkMemmove4096(b *testing.B) { bmMemmove(4096, b) }
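---

The trick the commit message describes for small sizes -- load the whole region into registers before doing any stores, so overlapping to/from buffers can be copied without checking which direction they overlap -- can be sketched at the Go level. The sketch below is illustrative only, not the runtime code: moveSmall and move are hypothetical names, encoding/binary loads stand in for the MOVW/MOVL/MOVQ register loads above, and the builtin copy stands in for the REP MOVSx and SSE bulk paths.

// Sketch of the small-size memmove strategy: read everything first,
// then write everything, so overlap direction never matters.
package main

import (
	"encoding/binary"
	"fmt"
)

// moveSmall mirrors the move_1or2 .. move_9through16 blocks: a pair of
// possibly overlapping loads sized to cover n bytes, followed by the
// matching stores. Every load completes before any store touches dst.
func moveSmall(dst, src []byte, n int) {
	switch {
	case n == 0:
		// move_0: nothing to do.
	case n <= 2:
		// move_1or2: first and last byte (the same byte when n == 1).
		a, b := src[0], src[n-1]
		dst[0], dst[n-1] = a, b
	case n <= 4:
		// move_3or4: two overlapping 2-byte loads.
		lo := binary.LittleEndian.Uint16(src[0:2])
		hi := binary.LittleEndian.Uint16(src[n-2 : n])
		binary.LittleEndian.PutUint16(dst[0:2], lo)
		binary.LittleEndian.PutUint16(dst[n-2:n], hi)
	case n <= 8:
		// move_5through8: two overlapping 4-byte loads.
		lo := binary.LittleEndian.Uint32(src[0:4])
		hi := binary.LittleEndian.Uint32(src[n-4 : n])
		binary.LittleEndian.PutUint32(dst[0:4], lo)
		binary.LittleEndian.PutUint32(dst[n-4:n], hi)
	default:
		// move_9through16: two overlapping 8-byte loads.
		lo := binary.LittleEndian.Uint64(src[0:8])
		hi := binary.LittleEndian.Uint64(src[n-8 : n])
		binary.LittleEndian.PutUint64(dst[0:8], lo)
		binary.LittleEndian.PutUint64(dst[n-8:n], hi)
	}
}

// move sketches the size dispatch in the "tail" label: straight-line code
// for small n, and a bulk path (here just the builtin copy, standing in
// for REP MOVSx / the wider SSE blocks) for everything larger.
func move(dst, src []byte) {
	n := len(src)
	if len(dst) < n {
		n = len(dst)
	}
	if n <= 16 {
		moveSmall(dst, src, n)
		return
	}
	copy(dst, src[:n]) // bulk path stand-in
}

func main() {
	buf := []byte("abcdefghij")
	// Overlapping move within one buffer: copy buf[0:8] onto buf[2:10].
	move(buf[2:10], buf[0:8])
	fmt.Println(string(buf)) // ababcdefgh
}

Running main prints "ababcdefgh": the forward, overlapping move comes out right even though source and destination share bytes, because all loads happen before the first store. A naive forward byte-by-byte loop would instead smear the first two bytes across the destination, which is why the assembly either checks direction (REP paths) or loads the whole chunk up front (small and SSE paths).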