bytes: Equal perf improvements on ppc64le/ppc64
author    Lynn Boger <laboger@linux.vnet.ibm.com>
          Mon, 7 Mar 2016 17:59:16 +0000 (11:59 -0600)
committer Minux Ma <minux@golang.org>
          Wed, 23 Mar 2016 14:21:15 +0000 (14:21 +0000)
The existing implementation of Equal and similar
functions in the bytes package operates on one byte at
a time.  This performs poorly on ppc64/ppc64le, especially
when the byte buffers are large.  This change improves
those functions by loading and comparing doublewords where
possible.  The common code has been moved to a function
that can be shared by the other functions in this
file that perform the same type of comparison.
Further optimizations are done for the case where
>= 32 bytes are being compared.  The new function
memeqbody is used by memequal, memequal_varlen, Equal, and eqstring.
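
As a rough Go sketch of the doubleword-at-a-time idea (illustration
only; the real implementation is the PPC64 assembly below, and the
helper name equalWordwise is made up):

    package eqsketch

    import "encoding/binary"

    // equalWordwise compares 8 bytes (one doubleword) per iteration and
    // falls back to a byte loop for the tail, mirroring the loop8/simple
    // structure of memeqbody.
    func equalWordwise(a, b []byte) bool {
        if len(a) != len(b) {
            return false
        }
        i := 0
        for ; i+8 <= len(a); i += 8 {
            // Load and compare one doubleword from each buffer; byte
            // order does not matter for an equality check.
            if binary.LittleEndian.Uint64(a[i:]) != binary.LittleEndian.Uint64(b[i:]) {
                return false
            }
        }
        for ; i < len(a); i++ { // leftover bytes, one at a time
            if a[i] != b[i] {
                return false
            }
        }
        return true
    }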

When running the bytes test with -test.bench=Equal

benchmark                     old MB/s     new MB/s     speedup
BenchmarkEqual1               164.83       129.49       0.79x
BenchmarkEqual6               563.51       445.47       0.79x
BenchmarkEqual9               656.15       1099.00      1.67x
BenchmarkEqual15              591.93       1024.30      1.73x
BenchmarkEqual16              613.25       1914.12      3.12x
BenchmarkEqual20              682.37       1687.04      2.47x
BenchmarkEqual32              807.96       3843.29      4.76x
BenchmarkEqual4K              1076.25      23280.51     21.63x
BenchmarkEqual4M              1079.30      13120.14     12.16x
BenchmarkEqual64M             1073.28      10876.92     10.13x
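
The MB/s columns come from the testing package's throughput reporting.
A minimal sketch of a benchmark of this shape (hypothetical names in a
_test.go file, not the actual bytes package benchmarks) uses b.SetBytes
so the framework prints MB/s:

    package eqsketch

    import (
        "bytes"
        "testing"
    )

    // benchmarkEqualSize measures bytes.Equal throughput on n-byte inputs.
    func benchmarkEqualSize(b *testing.B, n int) {
        x := make([]byte, n)
        y := make([]byte, n)
        b.SetBytes(int64(n)) // makes the framework report MB/s
        b.ResetTimer()
        for i := 0; i < b.N; i++ {
            if !bytes.Equal(x, y) {
                b.Fatal("expected equal buffers")
            }
        }
    }

    // Hypothetical size-specific wrapper, analogous to BenchmarkEqual4K above.
    func BenchmarkEqual4K(b *testing.B) { benchmarkEqualSize(b, 4096) }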

It was determined that the degradation in the smaller byte tests
was due to unfavorable code alignment of the single-byte loop.

Fixes #14368

Change-Id: I0dd87382c28887c70f4fbe80877a8ba03c31d7cd
Reviewed-on: https://go-review.googlesource.com/20249
Reviewed-by: Minux Ma <minux@golang.org>
src/runtime/asm_ppc64x.s

index 59bc8a22dded3ac631c632c4f2344c6db74f9532..8d9d01b1046c278d16cb6c4bd91fe6faae7c1e27 100644 (file)
@@ -795,33 +795,13 @@ TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
 TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
        MOVW    (R0), R1
 
-// memequal(p, q unsafe.Pointer, size uintptr) bool
-TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
-       MOVD    a+0(FP), R3
-       MOVD    b+8(FP), R4
-       CMP     R3, R4
-       BEQ     eq
-       MOVD    size+16(FP), R5
-       SUB     $1, R3
-       SUB     $1, R4
-       ADD     R3, R5, R8
-loop:
-       CMP     R3, R8
-       BNE     test
-       MOVD    $1, R3
-       MOVB    R3, ret+24(FP)
-       RET
-test:
-       MOVBZU  1(R3), R6
-       MOVBZU  1(R4), R7
-       CMP     R6, R7
-       BEQ     loop
+TEXT runtime·memequal(SB),NOSPLIT,$0-25
+       MOVD    a+0(FP), R3
+       MOVD    b+8(FP), R4
+       MOVD    size+16(FP), R5
 
-       MOVB    R0, ret+24(FP)
-       RET
-eq:
-       MOVD    $1, R1
-       MOVB    R1, ret+24(FP)
+       BL      runtime·memeqbody(SB)
+       MOVB    R9, ret+24(FP)
        RET
 
 // memequal_varlen(a, b unsafe.Pointer) bool
@@ -831,75 +811,129 @@ TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
        CMP     R3, R4
        BEQ     eq
        MOVD    8(R11), R5    // compiler stores size at offset 8 in the closure
-       MOVD    R3, FIXED_FRAME+0(R1)
-       MOVD    R4, FIXED_FRAME+8(R1)
-       MOVD    R5, FIXED_FRAME+16(R1)
-       BL      runtime·memequal(SB)
-       MOVBZ   FIXED_FRAME+24(R1), R3
-       MOVB    R3, ret+16(FP)
+       BL      runtime·memeqbody(SB)
+       MOVB    R9, ret+16(FP)
        RET
 eq:
        MOVD    $1, R3
        MOVB    R3, ret+16(FP)
        RET
 
+// Do an efficient memequal for ppc64
+// for reuse where possible.
+// R3 = s1
+// R4 = s2
+// R5 = len
+// R9 = return value
+// R6, R7 clobbered
+TEXT runtime·memeqbody(SB),NOSPLIT|NOFRAME,$0-0
+       MOVD    R5,CTR
+       CMP     R5,$8           // only optimize >=8
+       BLT     simplecheck
+       DCBT    (R3)            // cache hint
+       DCBT    (R4)
+       CMP     R5,$32          // optimize >= 32
+       MOVD    R5,R6           // needed if setup8a branch
+       BLT     setup8a         // 8 byte moves only
+setup32a:                       // 8 byte aligned, >= 32 bytes
+       SRADCC  $5,R5,R6        // number of 32 byte chunks to compare
+       MOVD    R6,CTR
+loop32a:
+       MOVD    0(R3),R6        // doublewords to compare
+       MOVD    0(R4),R7
+       MOVD    8(R3),R8        //
+       MOVD    8(R4),R9
+       CMP     R6,R7           // bytes match?
+       BNE     noteq
+       MOVD    16(R3),R6
+       MOVD    16(R4),R7
+       CMP     R8,R9           // bytes match?
+       MOVD    24(R3),R8
+       MOVD    24(R4),R9
+       BNE     noteq
+       CMP     R6,R7           // bytes match?
+       BNE     noteq
+       ADD     $32,R3          // bump up to next 32
+       ADD     $32,R4
+       CMP     R8,R9           // bytes match?
+       BC      8,2,loop32a     // br ctr and cr
+       BNE     noteq
+       ANDCC   $24,R5,R6       // Any 8 byte chunks?
+       BEQ     leftover        // and result is 0
+setup8a:
+       SRADCC  $3,R6,R6        // get the 8 byte count
+       BEQ     leftover        // shifted value is 0
+       MOVD    R6,CTR
+loop8:
+       MOVD    0(R3),R6        // doublewords to compare
+       ADD     $8,R3
+       MOVD    0(R4),R7
+       ADD     $8,R4
+       CMP     R6,R7           // match?
+       BC      8,2,loop8       // bt ctr <> 0 && cr
+       BNE     noteq
+leftover:
+       ANDCC   $7,R5,R6        // check for leftover bytes
+       BEQ     equal
+       MOVD    R6,CTR
+       BR      simple
+simplecheck:
+       CMP     R5,$0
+       BEQ     equal
+simple:
+       MOVBZ   0(R3), R6
+       ADD     $1,R3
+       MOVBZ   0(R4), R7
+       ADD     $1,R4
+       CMP     R6, R7
+       BNE     noteq
+       BC      8,2,simple
+       BNE     noteq
+       BR      equal
+noteq:
+       MOVD    $0, R9
+       RET
+equal:
+       MOVD    $1, R9
+       RET
+
 // eqstring tests whether two strings are equal.
 // The compiler guarantees that strings passed
 // to eqstring have equal length.
 // See runtime_test.go:eqstring_generic for
 // equivalent Go code.
 TEXT runtime·eqstring(SB),NOSPLIT,$0-33
-       MOVD    s1str+0(FP), R3
-       MOVD    s2str+16(FP), R4
-       MOVD    $1, R5
-       MOVB    R5, ret+32(FP)
-       CMP     R3, R4
-       BNE     2(PC)
+       MOVD    s1str+0(FP), R3
+       MOVD    s2str+16(FP), R4
+       MOVD    $1, R5
+       MOVB    R5, ret+32(FP)
+       CMP     R3, R4
+       BNE     2(PC)
        RET
-       MOVD    s1len+8(FP), R5
-       SUB     $1, R3
-       SUB     $1, R4
-       ADD     R3, R5, R8
-loop:
-       CMP     R3, R8
-       BNE     2(PC)
-       RET
-       MOVBZU  1(R3), R6
-       MOVBZU  1(R4), R7
-       CMP     R6, R7
-       BEQ     loop
-       MOVB    R0, ret+32(FP)
+       MOVD    s1len+8(FP), R5
+       BL      runtime·memeqbody(SB)
+       MOVB    R9, ret+32(FP)
        RET
 
-// TODO: share code with memequal?
 TEXT bytes·Equal(SB),NOSPLIT,$0-49
-       MOVD    a_len+8(FP), R3
-       MOVD    b_len+32(FP), R4
-
-       CMP     R3, R4          // unequal lengths are not equal
+       MOVD    a_len+8(FP), R4
+       MOVD    b_len+32(FP), R5
+       CMP     R5, R4          // unequal lengths are not equal
        BNE     noteq
+       MOVD    a+0(FP), R3
+       MOVD    b+24(FP), R4
+       BL      runtime·memeqbody(SB)
 
-       MOVD    a+0(FP), R5
-       MOVD    b+24(FP), R6
-       SUB     $1, R5
-       SUB     $1, R6
-       ADD     R5, R3          // end-1
-
-loop:
-       CMP     R5, R3
-       BEQ     equal           // reached the end
-       MOVBZU  1(R5), R4
-       MOVBZU  1(R6), R7
-       CMP     R4, R7
-       BEQ     loop
+       MOVBZ   R9,ret+48(FP)
+       RET
 
 noteq:
-       MOVBZ   R0, ret+48(FP)
+       MOVBZ   $0,ret+48(FP)
        RET
 
 equal:
-       MOVD    $1, R3
-       MOVBZ   R3, ret+48(FP)
+       MOVD    $1,R3
+       MOVBZ   R3,ret+48(FP)
        RET
 
 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40