From 6a33f7765f79cf2f00f5ca55832d2cfab8beb289 Mon Sep 17 00:00:00 2001
From: Keith Randall
Date: Sun, 6 Mar 2016 16:58:30 -0800
Subject: [PATCH] runtime: use MOVSB instead of MOVSQ for unaligned moves

MOVSB is quite a bit faster for unaligned moves. Possibly we should use
MOVSB all of the time, but Intel folks say it might be a bit faster to
use MOVSQ on some processors (but not any I have access to at the
moment).

benchmark                            old ns/op  new ns/op  delta
BenchmarkMemmove4096-8               93.9       93.2       -0.75%
BenchmarkMemmoveUnalignedDst4096-8   256        151        -41.02%
BenchmarkMemmoveUnalignedSrc4096-8   175        90.5       -48.29%

Fixes #14630

Change-Id: I568e6d6590eb3615e6a699fb474020596be665ff
Reviewed-on: https://go-review.googlesource.com/20293
Reviewed-by: Ian Lance Taylor
---
 src/runtime/memmove_386.s   | 14 +++++-
 src/runtime/memmove_amd64.s | 13 ++++++
 src/runtime/memmove_test.go | 87 ++++++++++++++++++++++++++-----------
 3 files changed, 87 insertions(+), 27 deletions(-)
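
Notes (not part of the original change): both new fast paths rely on the
same trick - OR-ing the source and destination addresses together lets a
single mask test check the alignment of both pointers at once. Below is
a minimal Go sketch of the amd64 forward path for illustration only;
forwardCopy is a made-up name, and the assembly in the patch itself is
the authoritative version.

	package main

	import (
		"fmt"
		"unsafe"
	)

	// forwardCopy mirrors the amd64 forward path: the MOVQ SI, AX;
	// ORQ DI, AX; TESTL $7, AX sequence folds both alignment checks
	// into one. It assumes a forward, non-overlapping copy.
	func forwardCopy(dst, src []byte) {
		n := len(src)
		if n == 0 {
			return
		}
		d := uintptr(unsafe.Pointer(&dst[0]))
		s := uintptr(unsafe.Pointer(&src[0]))
		if (d|s)&7 != 0 {
			// unaligned_fwd: what REP MOVSB does - one byte per step.
			for i := 0; i < n; i++ {
				dst[i] = src[i]
			}
			return
		}
		// Aligned: what REP MOVSQ does - 8 bytes per step (CX = n/8)...
		i := 0
		for ; i+8 <= n; i += 8 {
			*(*uint64)(unsafe.Pointer(&dst[i])) = *(*uint64)(unsafe.Pointer(&src[i]))
		}
		// ...then the tail picks up the remaining n&7 bytes (BX).
		for ; i < n; i++ {
			dst[i] = src[i]
		}
	}

	func main() {
		src := []byte("some unaligned bytes")
		dst := make([]byte, len(src))
		forwardCopy(dst, src)
		fmt.Println(string(dst))
	}
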
diff --git a/src/runtime/memmove_386.s b/src/runtime/memmove_386.s
index 9a21e84136..d4baf2280a 100644
--- a/src/runtime/memmove_386.s
+++ b/src/runtime/memmove_386.s
@@ -69,13 +69,25 @@ nosse2:
 
 /*
  * forward copy loop
  */
-forward:
+forward:
+	// Check alignment
+	MOVL	SI, AX
+	ORL	DI, AX
+	TESTL	$3, AX
+	JNE	unaligned_fwd
+
 	MOVL	BX, CX
 	SHRL	$2, CX
 	ANDL	$3, BX
 	REP;	MOVSL
 	JMP	tail
+
+unaligned_fwd:
+	MOVL	BX, CX
+	REP;	MOVSB
+	RET
+
 /*
  * check overlap
  */
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index ae95b155be..514eb169f1 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -77,12 +77,25 @@ forward:
 	CMPQ	BX, $2048
 	JLS	move_256through2048
 
+	// Check alignment
+	MOVQ	SI, AX
+	ORQ	DI, AX
+	TESTL	$7, AX
+	JNE	unaligned_fwd
+
+	// Aligned - do 8 bytes at a time
 	MOVQ	BX, CX
 	SHRQ	$3, CX
 	ANDQ	$7, BX
 	REP;	MOVSQ
 	JMP	tail
 
+unaligned_fwd:
+	// Unaligned - do 1 byte at a time
+	MOVQ	BX, CX
+	REP;	MOVSB
+	RET
+
 back:
 /*
  * check overlap
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index 371d84b6be..8bf0c65e29 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -116,7 +116,7 @@ func BenchmarkMemmove1024(b *testing.B) { bmMemmove(b, 1024) }
 func BenchmarkMemmove2048(b *testing.B) { bmMemmove(b, 2048) }
 func BenchmarkMemmove4096(b *testing.B) { bmMemmove(b, 4096) }
 
-func bmMemmoveUnaligned(b *testing.B, n int) {
+func bmMemmoveUnalignedDst(b *testing.B, n int) {
 	x := make([]byte, n+1)
 	y := make([]byte, n)
 	b.SetBytes(int64(n))
@@ -125,31 +125,66 @@
 	}
 }
 
-func BenchmarkMemmoveUnaligned0(b *testing.B) { bmMemmoveUnaligned(b, 0) }
-func BenchmarkMemmoveUnaligned1(b *testing.B) { bmMemmoveUnaligned(b, 1) }
-func BenchmarkMemmoveUnaligned2(b *testing.B) { bmMemmoveUnaligned(b, 2) }
-func BenchmarkMemmoveUnaligned3(b *testing.B) { bmMemmoveUnaligned(b, 3) }
-func BenchmarkMemmoveUnaligned4(b *testing.B) { bmMemmoveUnaligned(b, 4) }
-func BenchmarkMemmoveUnaligned5(b *testing.B) { bmMemmoveUnaligned(b, 5) }
-func BenchmarkMemmoveUnaligned6(b *testing.B) { bmMemmoveUnaligned(b, 6) }
-func BenchmarkMemmoveUnaligned7(b *testing.B) { bmMemmoveUnaligned(b, 7) }
-func BenchmarkMemmoveUnaligned8(b *testing.B) { bmMemmoveUnaligned(b, 8) }
-func BenchmarkMemmoveUnaligned9(b *testing.B) { bmMemmoveUnaligned(b, 9) }
-func BenchmarkMemmoveUnaligned10(b *testing.B) { bmMemmoveUnaligned(b, 10) }
-func BenchmarkMemmoveUnaligned11(b *testing.B) { bmMemmoveUnaligned(b, 11) }
-func BenchmarkMemmoveUnaligned12(b *testing.B) { bmMemmoveUnaligned(b, 12) }
-func BenchmarkMemmoveUnaligned13(b *testing.B) { bmMemmoveUnaligned(b, 13) }
-func BenchmarkMemmoveUnaligned14(b *testing.B) { bmMemmoveUnaligned(b, 14) }
-func BenchmarkMemmoveUnaligned15(b *testing.B) { bmMemmoveUnaligned(b, 15) }
-func BenchmarkMemmoveUnaligned16(b *testing.B) { bmMemmoveUnaligned(b, 16) }
-func BenchmarkMemmoveUnaligned32(b *testing.B) { bmMemmoveUnaligned(b, 32) }
-func BenchmarkMemmoveUnaligned64(b *testing.B) { bmMemmoveUnaligned(b, 64) }
-func BenchmarkMemmoveUnaligned128(b *testing.B) { bmMemmoveUnaligned(b, 128) }
-func BenchmarkMemmoveUnaligned256(b *testing.B) { bmMemmoveUnaligned(b, 256) }
-func BenchmarkMemmoveUnaligned512(b *testing.B) { bmMemmoveUnaligned(b, 512) }
-func BenchmarkMemmoveUnaligned1024(b *testing.B) { bmMemmoveUnaligned(b, 1024) }
-func BenchmarkMemmoveUnaligned2048(b *testing.B) { bmMemmoveUnaligned(b, 2048) }
-func BenchmarkMemmoveUnaligned4096(b *testing.B) { bmMemmoveUnaligned(b, 4096) }
+func BenchmarkMemmoveUnalignedDst0(b *testing.B) { bmMemmoveUnalignedDst(b, 0) }
+func BenchmarkMemmoveUnalignedDst1(b *testing.B) { bmMemmoveUnalignedDst(b, 1) }
+func BenchmarkMemmoveUnalignedDst2(b *testing.B) { bmMemmoveUnalignedDst(b, 2) }
+func BenchmarkMemmoveUnalignedDst3(b *testing.B) { bmMemmoveUnalignedDst(b, 3) }
+func BenchmarkMemmoveUnalignedDst4(b *testing.B) { bmMemmoveUnalignedDst(b, 4) }
+func BenchmarkMemmoveUnalignedDst5(b *testing.B) { bmMemmoveUnalignedDst(b, 5) }
+func BenchmarkMemmoveUnalignedDst6(b *testing.B) { bmMemmoveUnalignedDst(b, 6) }
+func BenchmarkMemmoveUnalignedDst7(b *testing.B) { bmMemmoveUnalignedDst(b, 7) }
+func BenchmarkMemmoveUnalignedDst8(b *testing.B) { bmMemmoveUnalignedDst(b, 8) }
+func BenchmarkMemmoveUnalignedDst9(b *testing.B) { bmMemmoveUnalignedDst(b, 9) }
+func BenchmarkMemmoveUnalignedDst10(b *testing.B) { bmMemmoveUnalignedDst(b, 10) }
+func BenchmarkMemmoveUnalignedDst11(b *testing.B) { bmMemmoveUnalignedDst(b, 11) }
+func BenchmarkMemmoveUnalignedDst12(b *testing.B) { bmMemmoveUnalignedDst(b, 12) }
+func BenchmarkMemmoveUnalignedDst13(b *testing.B) { bmMemmoveUnalignedDst(b, 13) }
+func BenchmarkMemmoveUnalignedDst14(b *testing.B) { bmMemmoveUnalignedDst(b, 14) }
+func BenchmarkMemmoveUnalignedDst15(b *testing.B) { bmMemmoveUnalignedDst(b, 15) }
+func BenchmarkMemmoveUnalignedDst16(b *testing.B) { bmMemmoveUnalignedDst(b, 16) }
+func BenchmarkMemmoveUnalignedDst32(b *testing.B) { bmMemmoveUnalignedDst(b, 32) }
+func BenchmarkMemmoveUnalignedDst64(b *testing.B) { bmMemmoveUnalignedDst(b, 64) }
+func BenchmarkMemmoveUnalignedDst128(b *testing.B) { bmMemmoveUnalignedDst(b, 128) }
+func BenchmarkMemmoveUnalignedDst256(b *testing.B) { bmMemmoveUnalignedDst(b, 256) }
+func BenchmarkMemmoveUnalignedDst512(b *testing.B) { bmMemmoveUnalignedDst(b, 512) }
+func BenchmarkMemmoveUnalignedDst1024(b *testing.B) { bmMemmoveUnalignedDst(b, 1024) }
+func BenchmarkMemmoveUnalignedDst2048(b *testing.B) { bmMemmoveUnalignedDst(b, 2048) }
+func BenchmarkMemmoveUnalignedDst4096(b *testing.B) { bmMemmoveUnalignedDst(b, 4096) }
+
+func bmMemmoveUnalignedSrc(b *testing.B, n int) {
+	x := make([]byte, n)
+	y := make([]byte, n+1)
+	b.SetBytes(int64(n))
+	for i := 0; i < b.N; i++ {
+		copy(x, y[1:])
+	}
+}
+
+func BenchmarkMemmoveUnalignedSrc0(b *testing.B) { bmMemmoveUnalignedSrc(b, 0) }
+func BenchmarkMemmoveUnalignedSrc1(b *testing.B) { bmMemmoveUnalignedSrc(b, 1) }
+func BenchmarkMemmoveUnalignedSrc2(b *testing.B) { bmMemmoveUnalignedSrc(b, 2) }
+func BenchmarkMemmoveUnalignedSrc3(b *testing.B) { bmMemmoveUnalignedSrc(b, 3) }
+func BenchmarkMemmoveUnalignedSrc4(b *testing.B) { bmMemmoveUnalignedSrc(b, 4) }
+func BenchmarkMemmoveUnalignedSrc5(b *testing.B) { bmMemmoveUnalignedSrc(b, 5) }
+func BenchmarkMemmoveUnalignedSrc6(b *testing.B) { bmMemmoveUnalignedSrc(b, 6) }
+func BenchmarkMemmoveUnalignedSrc7(b *testing.B) { bmMemmoveUnalignedSrc(b, 7) }
+func BenchmarkMemmoveUnalignedSrc8(b *testing.B) { bmMemmoveUnalignedSrc(b, 8) }
+func BenchmarkMemmoveUnalignedSrc9(b *testing.B) { bmMemmoveUnalignedSrc(b, 9) }
+func BenchmarkMemmoveUnalignedSrc10(b *testing.B) { bmMemmoveUnalignedSrc(b, 10) }
+func BenchmarkMemmoveUnalignedSrc11(b *testing.B) { bmMemmoveUnalignedSrc(b, 11) }
+func BenchmarkMemmoveUnalignedSrc12(b *testing.B) { bmMemmoveUnalignedSrc(b, 12) }
+func BenchmarkMemmoveUnalignedSrc13(b *testing.B) { bmMemmoveUnalignedSrc(b, 13) }
+func BenchmarkMemmoveUnalignedSrc14(b *testing.B) { bmMemmoveUnalignedSrc(b, 14) }
+func BenchmarkMemmoveUnalignedSrc15(b *testing.B) { bmMemmoveUnalignedSrc(b, 15) }
+func BenchmarkMemmoveUnalignedSrc16(b *testing.B) { bmMemmoveUnalignedSrc(b, 16) }
+func BenchmarkMemmoveUnalignedSrc32(b *testing.B) { bmMemmoveUnalignedSrc(b, 32) }
+func BenchmarkMemmoveUnalignedSrc64(b *testing.B) { bmMemmoveUnalignedSrc(b, 64) }
+func BenchmarkMemmoveUnalignedSrc128(b *testing.B) { bmMemmoveUnalignedSrc(b, 128) }
+func BenchmarkMemmoveUnalignedSrc256(b *testing.B) { bmMemmoveUnalignedSrc(b, 256) }
+func BenchmarkMemmoveUnalignedSrc512(b *testing.B) { bmMemmoveUnalignedSrc(b, 512) }
+func BenchmarkMemmoveUnalignedSrc1024(b *testing.B) { bmMemmoveUnalignedSrc(b, 1024) }
+func BenchmarkMemmoveUnalignedSrc2048(b *testing.B) { bmMemmoveUnalignedSrc(b, 2048) }
+func BenchmarkMemmoveUnalignedSrc4096(b *testing.B) { bmMemmoveUnalignedSrc(b, 4096) }
 
 func TestMemclr(t *testing.T) {
 	size := 512
-- 
2.48.1
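
The deltas quoted in the commit message are the 4096-byte cases of the
benchmarks above. Assuming a built Go tree on an amd64 machine, an
invocation along these lines should reproduce them (the exact flags are
a guess, not taken from the CL):

	go test runtime -run=NONE -bench='Memmove(UnalignedDst|UnalignedSrc)?4096'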