From: Uttam C Pawar Date: Thu, 2 Jul 2015 18:43:46 +0000 (-0700) Subject: bytes: improve Compare function on amd64 for large byte arrays X-Git-Tag: go1.6beta1~1266 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=32add8d7c8433d87aca782ddcd79898922ac96b7;p=gostls13.git bytes: improve Compare function on amd64 for large byte arrays This patch contains only loop unrolling change for size > 63B Following are the performance numbers for various sizes on On Haswell based system: Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz. benchcmp go.head.8.25.15.txt go.head.8.25.15.opt.txt benchmark old ns/op new ns/op delta BenchmarkBytesCompare1-4 5.37 5.37 +0.00% BenchmarkBytesCompare2-4 5.37 5.38 +0.19% BenchmarkBytesCompare4-4 5.37 5.37 +0.00% BenchmarkBytesCompare8-4 4.42 4.38 -0.90% BenchmarkBytesCompare16-4 4.27 4.45 +4.22% BenchmarkBytesCompare32-4 5.30 5.36 +1.13% BenchmarkBytesCompare64-4 6.93 6.78 -2.16% BenchmarkBytesCompare128-4 10.3 9.50 -7.77% BenchmarkBytesCompare256-4 17.1 13.8 -19.30% BenchmarkBytesCompare512-4 31.3 22.1 -29.39% BenchmarkBytesCompare1024-4 62.5 39.0 -37.60% BenchmarkBytesCompare2048-4 112 73.2 -34.64% Change-Id: I4eeb1c22732fd62cbac97ba757b0d29f648d4ef1 Reviewed-on: https://go-review.googlesource.com/11871 Reviewed-by: Keith Randall --- diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go index 6245e48180..8df62fcc6a 100644 --- a/src/bytes/bytes_test.go +++ b/src/bytes/bytes_test.go @@ -1255,3 +1255,34 @@ func BenchmarkRepeat(b *testing.B) { Repeat([]byte("-"), 80) } } + +func benchmarkBytesCompare(b *testing.B, n int) { + var x = make([]byte, n) + var y = make([]byte, n) + + for i := 0; i < n; i++ { + x[i] = 'a' + } + + for i := 0; i < n; i++ { + y[i] = 'a' + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + Compare(x, y) + } +} + +func BenchmarkBytesCompare1(b *testing.B) { benchmarkBytesCompare(b, 1) } +func BenchmarkBytesCompare2(b *testing.B) { benchmarkBytesCompare(b, 2) } +func BenchmarkBytesCompare4(b *testing.B) { benchmarkBytesCompare(b, 4) } +func BenchmarkBytesCompare8(b *testing.B) { benchmarkBytesCompare(b, 8) } +func BenchmarkBytesCompare16(b *testing.B) { benchmarkBytesCompare(b, 16) } +func BenchmarkBytesCompare32(b *testing.B) { benchmarkBytesCompare(b, 32) } +func BenchmarkBytesCompare64(b *testing.B) { benchmarkBytesCompare(b, 64) } +func BenchmarkBytesCompare128(b *testing.B) { benchmarkBytesCompare(b, 128) } +func BenchmarkBytesCompare256(b *testing.B) { benchmarkBytesCompare(b, 256) } +func BenchmarkBytesCompare512(b *testing.B) { benchmarkBytesCompare(b, 512) } +func BenchmarkBytesCompare1024(b *testing.B) { benchmarkBytesCompare(b, 1024) } +func BenchmarkBytesCompare2048(b *testing.B) { benchmarkBytesCompare(b, 2048) } diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 3b4ca4d012..ff2da3a858 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1445,6 +1445,8 @@ TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 CMPQ R8, $8 JB small + CMPQ R8, $63 + JA big_loop loop: CMPQ R8, $16 JBE _0through16 @@ -1459,6 +1461,17 @@ loop: SUBQ $16, R8 JMP loop +diff64: + ADDQ $48, SI + ADDQ $48, DI + JMP diff16 +diff48: + ADDQ $32, SI + ADDQ $32, DI + JMP diff16 +diff32: + ADDQ $16, SI + ADDQ $16, DI // AX = bit mask of differences diff16: BSFQ AX, BX // index of first byte that differs @@ -1545,6 +1558,43 @@ allsame: MOVQ AX, (R9) RET + // this works for >= 64 bytes of data. +big_loop: + MOVOU (SI), X0 + MOVOU (DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff16 + + MOVOU 16(SI), X0 + MOVOU 16(DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff32 + + MOVOU 32(SI), X0 + MOVOU 32(DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff48 + + MOVOU 48(SI), X0 + MOVOU 48(DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff64 + + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, R8 + CMPQ R8, $64 + JBE loop + JMP big_loop + TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 MOVQ s+0(FP), SI MOVQ s_len+8(FP), BX