]> Cypherpunks repositories - gostls13.git/commitdiff
bytes: improve Compare function on amd64 for large byte arrays
authorUttam C Pawar <uttam.c.pawar@intel.com>
Thu, 2 Jul 2015 18:43:46 +0000 (11:43 -0700)
committerKeith Randall <khr@golang.org>
Wed, 26 Aug 2015 03:52:20 +0000 (03:52 +0000)
This patch contains only loop unrolling change for size > 63B

Following are the performance numbers for various sizes on
On Haswell based system: Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz.

benchcmp go.head.8.25.15.txt go.head.8.25.15.opt.txt
benchmark                       old ns/op     new ns/op     delta
BenchmarkBytesCompare1-4        5.37          5.37          +0.00%
BenchmarkBytesCompare2-4        5.37          5.38          +0.19%
BenchmarkBytesCompare4-4        5.37          5.37          +0.00%
BenchmarkBytesCompare8-4        4.42          4.38          -0.90%
BenchmarkBytesCompare16-4       4.27          4.45          +4.22%
BenchmarkBytesCompare32-4       5.30          5.36          +1.13%
BenchmarkBytesCompare64-4       6.93          6.78          -2.16%
BenchmarkBytesCompare128-4      10.3          9.50          -7.77%
BenchmarkBytesCompare256-4      17.1          13.8          -19.30%
BenchmarkBytesCompare512-4      31.3          22.1          -29.39%
BenchmarkBytesCompare1024-4     62.5          39.0          -37.60%
BenchmarkBytesCompare2048-4     112           73.2          -34.64%

Change-Id: I4eeb1c22732fd62cbac97ba757b0d29f648d4ef1
Reviewed-on: https://go-review.googlesource.com/11871
Reviewed-by: Keith Randall <khr@golang.org>
src/bytes/bytes_test.go
src/runtime/asm_amd64.s

index 6245e481805779e8665965053bde7cae4ce31e78..8df62fcc6aec1742a277d5fafd3d4a9705a63d30 100644 (file)
@@ -1255,3 +1255,34 @@ func BenchmarkRepeat(b *testing.B) {
                Repeat([]byte("-"), 80)
        }
 }
+
+func benchmarkBytesCompare(b *testing.B, n int) {
+       var x = make([]byte, n)
+       var y = make([]byte, n)
+
+       for i := 0; i < n; i++ {
+               x[i] = 'a'
+       }
+
+       for i := 0; i < n; i++ {
+               y[i] = 'a'
+       }
+
+       b.ResetTimer()
+       for i := 0; i < b.N; i++ {
+               Compare(x, y)
+       }
+}
+
+func BenchmarkBytesCompare1(b *testing.B)    { benchmarkBytesCompare(b, 1) }
+func BenchmarkBytesCompare2(b *testing.B)    { benchmarkBytesCompare(b, 2) }
+func BenchmarkBytesCompare4(b *testing.B)    { benchmarkBytesCompare(b, 4) }
+func BenchmarkBytesCompare8(b *testing.B)    { benchmarkBytesCompare(b, 8) }
+func BenchmarkBytesCompare16(b *testing.B)   { benchmarkBytesCompare(b, 16) }
+func BenchmarkBytesCompare32(b *testing.B)   { benchmarkBytesCompare(b, 32) }
+func BenchmarkBytesCompare64(b *testing.B)   { benchmarkBytesCompare(b, 64) }
+func BenchmarkBytesCompare128(b *testing.B)  { benchmarkBytesCompare(b, 128) }
+func BenchmarkBytesCompare256(b *testing.B)  { benchmarkBytesCompare(b, 256) }
+func BenchmarkBytesCompare512(b *testing.B)  { benchmarkBytesCompare(b, 512) }
+func BenchmarkBytesCompare1024(b *testing.B) { benchmarkBytesCompare(b, 1024) }
+func BenchmarkBytesCompare2048(b *testing.B) { benchmarkBytesCompare(b, 2048) }
index 3b4ca4d012f88fdbc099e19fd3581bdf3356d674..ff2da3a858094e8e818799572f24bbf60fe8f852 100644 (file)
@@ -1445,6 +1445,8 @@ TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
        CMPQ    R8, $8
        JB      small
 
+       CMPQ    R8, $63
+       JA      big_loop
 loop:
        CMPQ    R8, $16
        JBE     _0through16
@@ -1459,6 +1461,17 @@ loop:
        SUBQ    $16, R8
        JMP     loop
        
+diff64:
+       ADDQ    $48, SI
+       ADDQ    $48, DI
+       JMP     diff16
+diff48:
+       ADDQ    $32, SI
+       ADDQ    $32, DI
+       JMP     diff16
+diff32:
+       ADDQ    $16, SI
+       ADDQ    $16, DI
        // AX = bit mask of differences
 diff16:
        BSFQ    AX, BX  // index of first byte that differs
@@ -1545,6 +1558,43 @@ allsame:
        MOVQ    AX, (R9)
        RET
 
+       // this works for >= 64 bytes of data.
+big_loop:
+       MOVOU   (SI), X0
+       MOVOU   (DI), X1
+       PCMPEQB X0, X1
+       PMOVMSKB X1, AX
+       XORQ    $0xffff, AX
+       JNE     diff16
+
+       MOVOU   16(SI), X0
+       MOVOU   16(DI), X1
+       PCMPEQB X0, X1
+       PMOVMSKB X1, AX
+       XORQ    $0xffff, AX
+       JNE     diff32
+
+       MOVOU   32(SI), X0
+       MOVOU   32(DI), X1
+       PCMPEQB X0, X1
+       PMOVMSKB X1, AX
+       XORQ    $0xffff, AX
+       JNE     diff48
+
+       MOVOU   48(SI), X0
+       MOVOU   48(DI), X1
+       PCMPEQB X0, X1
+       PMOVMSKB X1, AX
+       XORQ    $0xffff, AX
+       JNE     diff64
+
+       ADDQ    $64, SI
+       ADDQ    $64, DI
+       SUBQ    $64, R8
+       CMPQ    R8, $64
+       JBE     loop
+       JMP     big_loop
+
 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
        MOVQ s+0(FP), SI
        MOVQ s_len+8(FP), BX