]> Cypherpunks repositories - gostls13.git/commitdiff
bytes: add optimized Compare for arm64
authorfanzha02 <fannie.zhang@arm.com>
Fri, 26 Jan 2018 09:18:31 +0000 (09:18 +0000)
committerCherry Zhang <cherryyz@google.com>
Tue, 20 Mar 2018 00:06:34 +0000 (00:06 +0000)
Use LDP instructions to load 16 bytes per loop when the source length is long. Specially
process the 8 bytes length, 4 bytes length and 2 bytes length to get a better performance.

Benchmark result:
name                           old time/op   new time/op    delta
BytesCompare/1-8                21.0ns ± 0%    10.5ns ± 0%      ~     (p=0.079 n=4+5)
BytesCompare/2-8                11.5ns ± 0%    10.5ns ± 0%    -8.70%  (p=0.008 n=5+5)
BytesCompare/4-8                13.5ns ± 0%    10.0ns ± 0%   -25.93%  (p=0.008 n=5+5)
BytesCompare/8-8                28.8ns ± 0%     9.5ns ± 0%      ~     (p=0.079 n=4+5)
BytesCompare/16-8               40.5ns ± 0%    10.5ns ± 0%   -74.07%  (p=0.008 n=5+5)
BytesCompare/32-8               64.6ns ± 0%    12.5ns ± 0%   -80.65%  (p=0.008 n=5+5)
BytesCompare/64-8                112ns ± 0%      16ns ± 0%   -85.27%  (p=0.008 n=5+5)
BytesCompare/128-8               208ns ± 0%      24ns ± 0%   -88.22%  (p=0.008 n=5+5)
BytesCompare/256-8               400ns ± 0%      50ns ± 0%   -87.62%  (p=0.008 n=5+5)
BytesCompare/512-8               785ns ± 0%      82ns ± 0%   -89.61%  (p=0.008 n=5+5)
BytesCompare/1024-8             1.55µs ± 0%    0.14µs ± 0%      ~     (p=0.079 n=4+5)
BytesCompare/2048-8             3.09µs ± 0%    0.27µs ± 0%      ~     (p=0.079 n=4+5)
CompareBytesEqual-8             39.0ns ± 0%    12.0ns ± 0%   -69.23%  (p=0.008 n=5+5)
CompareBytesToNil-8             8.57ns ± 5%    8.23ns ± 2%    -3.99%  (p=0.016 n=5+5)
CompareBytesEmpty-8             7.37ns ± 0%    7.36ns ± 4%      ~     (p=0.690 n=5+5)
CompareBytesIdentical-8         7.39ns ± 0%    7.46ns ± 2%      ~     (p=0.667 n=5+5)
CompareBytesSameLength-8        17.0ns ± 0%    10.5ns ± 0%   -38.24%  (p=0.008 n=5+5)
CompareBytesDifferentLength-8   17.0ns ± 0%    10.5ns ± 0%   -38.24%  (p=0.008 n=5+5)
CompareBytesBigUnaligned-8      1.58ms ± 0%    0.19ms ± 0%   -88.31%  (p=0.016 n=4+5)
CompareBytesBig-8               1.59ms ± 0%    0.19ms ± 0%   -88.27%  (p=0.016 n=5+4)
CompareBytesBigIdentical-8      7.01ns ± 0%    6.60ns ± 3%    -5.91%  (p=0.008 n=5+5)

name                           old speed     new speed      delta
CompareBytesBigUnaligned-8     662MB/s ± 0%  5660MB/s ± 0%  +755.15%  (p=0.016 n=4+5)
CompareBytesBig-8              661MB/s ± 0%  5636MB/s ± 0%  +752.57%  (p=0.016 n=5+4)
CompareBytesBigIdentical-8     150TB/s ± 0%   159TB/s ± 3%    +6.27%  (p=0.008 n=5+5)

Change-Id: I70332de06f873df3bc12c4a5af1028307b670046
Reviewed-on: https://go-review.googlesource.com/90175
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>

src/internal/bytealg/compare_arm64.s

index 9b6354715a4c490c7e33d37f3e85db11a03a8ce9..2bd38064c30b4e7bc0d2eb3894d0da40d4ef8192 100644 (file)
@@ -10,7 +10,7 @@ TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
        MOVD    a_len+8(FP), R0
        MOVD    b_base+24(FP), R3
        MOVD    b_len+32(FP), R1
-       ADD     $56, RSP, R7
+       MOVD    $ret+48(FP), R7
        B       cmpbody<>(SB)
 
 TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
@@ -18,7 +18,7 @@ TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
        MOVD    a_len+8(FP), R0
        MOVD    b_base+24(FP), R3
        MOVD    b_len+32(FP), R1
-       ADD     $56, RSP, R7
+       MOVD    $ret+48(FP), R7
        B       cmpbody<>(SB)
 
 TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
@@ -26,7 +26,7 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
        MOVD    a_len+8(FP), R0
        MOVD    b_base+16(FP), R3
        MOVD    b_len+24(FP), R1
-       ADD     $40, RSP, R7
+       MOVD    $ret+32(FP), R7
        B       cmpbody<>(SB)
 
 // On entry:
@@ -37,30 +37,98 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
 // R7 points to return value (-1/0/1 will be written here)
 //
 // On exit:
-// R4, R5, and R6 are clobbered
+// R4, R5, R6, R8, R9 and R10 are clobbered
 TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
        CMP     R2, R3
-       BEQ     samebytes // same starting pointers; compare lengths
+       BEQ     samebytes         // same starting pointers; compare lengths
        CMP     R0, R1
-       CSEL    LT, R1, R0, R6 // R6 is min(R0, R1)
+       CSEL    LT, R1, R0, R6    // R6 is min(R0, R1)
 
-       ADD     R2, R6  // R2 is current byte in a, R6 is last byte in a to compare
-loop:
-       CMP     R2, R6
-       BEQ     samebytes // all compared bytes were the same; compare lengths
-       MOVBU.P 1(R2), R4
-       MOVBU.P 1(R3), R5
+       CMP     $0, R6
+       BEQ     samebytes
+       BIC     $0xf, R6, R10
+       CBZ     R10, small        // length < 16
+       ADD     R2, R10           // end of chunk16
+       // length >= 16
+chunk16_loop:
+       LDP.P   16(R2), (R4, R8)
+       LDP.P   16(R3), (R5, R9)
        CMP     R4, R5
-       BEQ     loop
-       // bytes differed
+       BNE     cmp
+       CMP     R8, R9
+       BNE     cmpnext
+       CMP     R10, R2
+       BNE     chunk16_loop
+       AND     $0xf, R6, R6
+       CBZ     R6, samebytes
+       SUBS    $8, R6
+       BLT     tail
+       // the length of tail > 8 bytes
+       MOVD.P  8(R2), R4
+       MOVD.P  8(R3), R5
+       CMP     R4, R5
+       BNE     cmp
+       SUB     $8, R6
+       // compare last 8 bytes
+tail:
+       MOVD    (R2)(R6), R4
+       MOVD    (R3)(R6), R5
+       CMP     R4, R5
+       BEQ     samebytes
+cmp:
+       REV     R4, R4
+       REV     R5, R5
+       CMP     R4, R5
+ret:
        MOVD    $1, R4
-       CSNEG   LT, R4, R4, R4
+       CNEG    HI, R4, R4
        MOVD    R4, (R7)
        RET
+small:
+       TBZ     $3, R6, lt_8
+       MOVD    (R2), R4
+       MOVD    (R3), R5
+       CMP     R4, R5
+       BNE     cmp
+       SUBS    $8, R6
+       BEQ     samebytes
+       ADD     $8, R2
+       ADD     $8, R3
+       SUB     $8, R6
+       B       tail
+lt_8:
+       TBZ     $2, R6, lt_4
+       MOVWU   (R2), R4
+       MOVWU   (R3), R5
+       CMPW    R4, R5
+       BNE     cmp
+       SUBS    $4, R6
+       BEQ     samebytes
+       ADD     $4, R2
+       ADD     $4, R3
+lt_4:
+       TBZ     $1, R6, lt_2
+       MOVHU   (R2), R4
+       MOVHU   (R3), R5
+       CMPW    R4, R5
+       BNE     cmp
+       ADD     $2, R2
+       ADD     $2, R3
+lt_2:
+       TBZ     $0, R6, samebytes
+one:
+       MOVBU   (R2), R4
+       MOVBU   (R3), R5
+       CMPW    R4, R5
+       BNE     ret
 samebytes:
-       MOVD    $1, R4
-       CMP     R0, R1
-       CSNEG   LT, R4, R4, R4
-       CSEL    EQ, ZR, R4, R4
+       CMP     R1, R0
+       CSET    NE, R4
+       CNEG    LO, R4, R4
        MOVD    R4, (R7)
        RET
+cmpnext:
+       REV     R8, R4
+       REV     R9, R5
+       CMP     R4, R5
+       B       ret