]> Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: improve mips64x equal on large size
authorMeng Zhuo <mzh@golangcn.org>
Thu, 22 Oct 2020 06:03:11 +0000 (14:03 +0800)
committerCherry Zhang <cherryyz@google.com>
Fri, 23 Oct 2020 15:53:01 +0000 (15:53 +0000)
name               old time/op    new time/op    delta
Equal/0              9.94ns ± 4%    9.12ns ± 5%     -8.26%  (p=0.000 n=10+10)
Equal/1              24.5ns ± 0%    27.2ns ± 1%    +11.22%  (p=0.000 n=9+10)
Equal/6              28.1ns ± 0%    32.1ns ± 1%    +14.20%  (p=0.000 n=8+10)
Equal/9              37.1ns ± 0%    37.8ns ± 1%     +1.95%  (p=0.000 n=8+9)
Equal/15             47.3ns ± 0%    44.3ns ± 0%     -6.34%  (p=0.000 n=9+10)
Equal/16             42.9ns ± 0%    24.6ns ± 0%    -42.66%  (p=0.000 n=10+7)
Equal/20             44.3ns ± 0%    57.4ns ± 0%    +29.57%  (p=0.000 n=9+10)
Equal/32             63.2ns ± 0%    35.8ns ± 0%    -43.35%  (p=0.000 n=10+10)
Equal/4K             6.49µs ± 0%    0.50µs ± 0%    -92.27%  (p=0.000 n=10+8)
Equal/4M             6.70ms ± 0%    0.48ms ± 0%    -92.78%  (p=0.000 n=8+10)
Equal/64M             110ms ± 0%       8ms ± 0%    -92.65%  (p=0.000 n=9+9)
CompareBytesEqual    36.6ns ± 0%    35.9ns ± 0%     -1.83%  (p=0.000 n=10+9)

name               old speed      new speed      delta
Equal/1            40.8MB/s ± 0%  36.7MB/s ± 0%    -10.16%  (p=0.000 n=10+10)
Equal/6             213MB/s ± 0%   187MB/s ± 1%    -12.32%  (p=0.000 n=10+10)
Equal/9             243MB/s ± 0%   238MB/s ± 1%     -1.94%  (p=0.000 n=9+10)
Equal/15            317MB/s ± 0%   339MB/s ± 0%     +6.86%  (p=0.000 n=9+9)
Equal/16            373MB/s ± 0%   651MB/s ± 0%    +74.70%  (p=0.000 n=8+10)
Equal/20            452MB/s ± 0%   348MB/s ± 0%    -22.90%  (p=0.000 n=8+10)
Equal/32            506MB/s ± 0%   893MB/s ± 0%    +76.53%  (p=0.000 n=10+9)
Equal/4K            631MB/s ± 0%  8166MB/s ± 0%  +1194.73%  (p=0.000 n=10+10)
Equal/4M            626MB/s ± 0%  8673MB/s ± 0%  +1284.94%  (p=0.000 n=8+10)
Equal/64M           608MB/s ± 0%  8277MB/s ± 0%  +1260.83%  (p=0.000 n=9+9)

Change-Id: I1cd14ade16390a5097a8d4e9721d5e822fa6218f
Reviewed-on: https://go-review.googlesource.com/c/go/+/199597
Run-TryBot: Meng Zhuo <mzh@golangcn.org>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Trust: Meng Zhuo <mzh@golangcn.org>

src/internal/bytealg/equal_mips64x.s

index 58dc4303b4844641ac85a8edb44b6d51c927592f..641e3ff06caf885e126eaefcdd438110171a2f97 100644 (file)
@@ -16,18 +16,82 @@ TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
        BEQ     R1, R2, eq
        MOVV    size+16(FP), R3
        ADDV    R1, R3, R4
-loop:
-       BNE     R1, R4, test
+
+       // chunk size is 16
+       SGTU    $16, R3, R8
+       BEQ     R0, R8, chunk_entry
+
+byte_loop:
+       BNE     R1, R4, byte_test
        MOVV    $1, R1
        MOVB    R1, ret+24(FP)
        RET
-test:
+byte_test:
        MOVBU   (R1), R6
        ADDV    $1, R1
        MOVBU   (R2), R7
        ADDV    $1, R2
-       BEQ     R6, R7, loop
+       BEQ     R6, R7, byte_loop
+       JMP     not_eq
+
+chunk_entry:
+       // make sure both a and b are aligned
+       OR      R1, R2, R9
+       AND     $0x7, R9
+       BNE     R0, R9, byte_loop
+       JMP     chunk_loop_1
+
+chunk_loop:
+       // chunk size is 16
+       SGTU    $16, R3, R8
+       BNE     R0, R8, chunk_tail_8
+chunk_loop_1:
+       MOVV    (R1), R6
+       MOVV    (R2), R7
+       BNE     R6, R7, not_eq
+       MOVV    8(R1), R12
+       MOVV    8(R2), R13
+       ADDV    $16, R1
+       ADDV    $16, R2
+       SUBV    $16, R3
+       BEQ     R12, R13, chunk_loop
+       JMP     not_eq
+
+chunk_tail_8:
+       AND     $8, R3, R14
+       BEQ     R0, R14, chunk_tail_4
+       MOVV    (R1), R6
+       MOVV    (R2), R7
+       BNE     R6, R7, not_eq
+       ADDV    $8, R1
+       ADDV    $8, R2
+
+chunk_tail_4:
+       AND     $4, R3, R14
+       BEQ     R0, R14, chunk_tail_2
+       MOVWU   (R1), R6
+       MOVWU   (R2), R7
+       BNE     R6, R7, not_eq
+       ADDV    $4, R1
+       ADDV    $4, R2
+
+chunk_tail_2:
+       AND     $2, R3, R14
+       BEQ     R0, R14, chunk_tail_1
+       MOVHU   (R1), R6
+       MOVHU   (R2), R7
+       BNE     R6, R7, not_eq
+       ADDV    $2, R1
+       ADDV    $2, R2
+
+chunk_tail_1:
+       AND     $1, R3, R14
+       BEQ     R0, R14, eq
+       MOVBU   (R1), R6
+       MOVBU   (R2), R7
+       BEQ     R6, R7, eq
 
+not_eq:
        MOVB    R0, ret+24(FP)
        RET
 eq: