]> Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: use word-wise comparison for Compare on arm
authorTobias Klauser <tklauser@distanz.ch>
Tue, 5 Mar 2019 11:09:12 +0000 (12:09 +0100)
committerTobias Klauser <tobias.klauser@gmail.com>
Wed, 6 Mar 2019 16:09:38 +0000 (16:09 +0000)
Use word-wise comparison for aligned buffers, otherwise fall back to the
current byte-wise comparison.

name                           old time/op    new time/op    delta
BytesCompare/1-4                 41.3ns ± 0%    36.4ns ± 1%   -11.73%  (p=0.008 n=5+5)
BytesCompare/2-4                 39.5ns ± 0%    39.5ns ± 1%      ~     (p=0.960 n=5+5)
BytesCompare/4-4                 45.3ns ± 0%    41.0ns ± 1%    -9.40%  (p=0.008 n=5+5)
BytesCompare/8-4                 64.8ns ± 1%    44.7ns ± 0%   -31.12%  (p=0.008 n=5+5)
BytesCompare/16-4                86.3ns ± 0%    55.1ns ± 0%   -36.21%  (p=0.008 n=5+5)
BytesCompare/32-4                 135ns ± 0%      70ns ± 1%   -47.73%  (p=0.008 n=5+5)
BytesCompare/64-4                 231ns ± 1%      99ns ± 0%   -57.27%  (p=0.016 n=5+4)
BytesCompare/128-4                424ns ± 0%     147ns ± 0%   -65.31%  (p=0.000 n=4+5)
BytesCompare/256-4                810ns ± 0%     243ns ± 0%   -69.96%  (p=0.008 n=5+5)
BytesCompare/512-4               1.59µs ± 0%    0.44µs ± 0%   -72.43%  (p=0.008 n=5+5)
BytesCompare/1024-4              3.14µs ± 1%    0.83µs ± 1%   -73.56%  (p=0.008 n=5+5)
BytesCompare/2048-4              6.23µs ± 0%    1.61µs ± 1%   -74.21%  (p=0.008 n=5+5)
CompareBytesEqual-4              79.4ns ± 0%    52.2ns ± 0%   -34.23%  (p=0.008 n=5+5)
CompareBytesToNil-4              31.0ns ± 0%    30.3ns ± 0%    -2.32%  (p=0.008 n=5+5)
CompareBytesEmpty-4              25.7ns ± 0%    25.7ns ± 0%      ~     (p=0.556 n=4+5)
CompareBytesIdentical-4          25.7ns ± 0%    25.7ns ± 0%      ~     (p=1.000 n=5+5)
CompareBytesSameLength-4         49.1ns ± 0%    48.5ns ± 0%    -1.26%  (p=0.008 n=5+5)
CompareBytesDifferentLength-4    49.8ns ± 1%    49.3ns ± 0%    -1.08%  (p=0.008 n=5+5)
CompareBytesBigUnaligned-4       5.71ms ± 1%    5.68ms ± 1%      ~     (p=0.222 n=5+5)
CompareBytesBig-4                4.95ms ± 0%    2.28ms ± 1%   -53.81%  (p=0.008 n=5+5)
CompareBytesBigIdentical-4       27.2ns ± 1%    27.3ns ± 1%      ~     (p=0.310 n=5+5)

name                           old speed      new speed      delta
CompareBytesBigUnaligned-4      184MB/s ± 1%   185MB/s ± 1%      ~     (p=0.222 n=5+5)
CompareBytesBig-4               212MB/s ± 0%   459MB/s ± 1%  +116.51%  (p=0.008 n=5+5)
CompareBytesBigIdentical-4     38.5TB/s ± 0%  38.4TB/s ± 1%      ~     (p=0.421 n=5+5)

Also, this reduces time for TestCompareBytes by about 20 sec on a
linux-arm builder via gomote.

Updates #29001

Change-Id: I25f148739b9ccb7cb1fc97b3d8763549b0a66c16
Reviewed-on: https://go-review.googlesource.com/c/go/+/165338
Run-TryBot: Tobias Klauser <tobias.klauser@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
src/internal/bytealg/compare_arm.s

index c5bfdda33fb027fa16ada91e3b8aff7f04b82fdc..80d01a217fbc968fb06d44fe61781b9801c70e6b 100644 (file)
@@ -29,31 +29,58 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-20
 // R7 points to return value (-1/0/1 will be written here)
 //
 // On exit:
-// R4, R5, and R6 are clobbered
+// R4, R5, R6 and R8 are clobbered
 TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
        CMP     R2, R3
        BEQ     samebytes
        CMP     R0, R1
        MOVW    R0, R6
-       MOVW.LT R1, R6  // R6 is min(R0, R1)
+       MOVW.LT R1, R6          // R6 is min(R0, R1)
 
-       ADD     R2, R6  // R2 is current byte in a, R6 is last byte in a to compare
-loop:
+       CMP     $0, R6
+       BEQ     samebytes
+       CMP     $4, R6
+       ADD     R2, R6          // R2 is current byte in a, R6 is the end of the range to compare
+       BLT     byte_loop       // length < 4
+       AND     $3, R2, R8
+       CMP     $0, R8
+       BNE     byte_loop       // unaligned a, use byte-wise compare (TODO: try to align a)
+aligned_a:
+       AND     $3, R3, R8
+       CMP     $0, R8
+       BNE     byte_loop       // unaligned b, use byte-wise compare
+       AND     $0xfffffffc, R6, R8
+       // length >= 4
+chunk4_loop:
+       MOVW.P  4(R2), R4
+       MOVW.P  4(R3), R5
+       CMP     R4, R5
+       BNE     cmp
+       CMP     R2, R8
+       BNE     chunk4_loop
        CMP     R2, R6
-       BEQ     samebytes // all compared bytes were the same; compare lengths
+       BEQ     samebytes       // all compared bytes were the same; compare lengths
+byte_loop:
        MOVBU.P 1(R2), R4
        MOVBU.P 1(R3), R5
        CMP     R4, R5
-       BEQ     loop
-       // bytes differed
+       BNE     ret
+       CMP     R2, R6
+       BNE     byte_loop
+samebytes:
+       CMP     R0, R1
        MOVW.LT $1, R0
        MOVW.GT $-1, R0
+       MOVW.EQ $0, R0
        MOVW    R0, (R7)
        RET
-samebytes:
-       CMP     R0, R1
+ret:
+       // bytes differed
        MOVW.LT $1, R0
        MOVW.GT $-1, R0
-       MOVW.EQ $0, R0
        MOVW    R0, (R7)
        RET
+cmp:
+       SUB     $4, R2, R2
+       SUB     $4, R3, R3
+       B       byte_loop