name old time/op new time/op delta
Equal/0 9.94ns ± 4% 9.12ns ± 5% -8.26% (p=0.000 n=10+10)
Equal/1 24.5ns ± 0% 27.2ns ± 1% +11.22% (p=0.000 n=9+10)
Equal/6 28.1ns ± 0% 32.1ns ± 1% +14.20% (p=0.000 n=8+10)
Equal/9 37.1ns ± 0% 37.8ns ± 1% +1.95% (p=0.000 n=8+9)
Equal/15 47.3ns ± 0% 44.3ns ± 0% -6.34% (p=0.000 n=9+10)
Equal/16 42.9ns ± 0% 24.6ns ± 0% -42.66% (p=0.000 n=10+7)
Equal/20 44.3ns ± 0% 57.4ns ± 0% +29.57% (p=0.000 n=9+10)
Equal/32 63.2ns ± 0% 35.8ns ± 0% -43.35% (p=0.000 n=10+10)
Equal/4K 6.49µs ± 0% 0.50µs ± 0% -92.27% (p=0.000 n=10+8)
Equal/4M 6.70ms ± 0% 0.48ms ± 0% -92.78% (p=0.000 n=8+10)
Equal/64M 110ms ± 0% 8ms ± 0% -92.65% (p=0.000 n=9+9)
CompareBytesEqual 36.6ns ± 0% 35.9ns ± 0% -1.83% (p=0.000 n=10+9)
name old speed new speed delta
Equal/1 40.8MB/s ± 0% 36.7MB/s ± 0% -10.16% (p=0.000 n=10+10)
Equal/6 213MB/s ± 0% 187MB/s ± 1% -12.32% (p=0.000 n=10+10)
Equal/9 243MB/s ± 0% 238MB/s ± 1% -1.94% (p=0.000 n=9+10)
Equal/15 317MB/s ± 0% 339MB/s ± 0% +6.86% (p=0.000 n=9+9)
Equal/16 373MB/s ± 0% 651MB/s ± 0% +74.70% (p=0.000 n=8+10)
Equal/20 452MB/s ± 0% 348MB/s ± 0% -22.90% (p=0.000 n=8+10)
Equal/32 506MB/s ± 0% 893MB/s ± 0% +76.53% (p=0.000 n=10+9)
Equal/4K 631MB/s ± 0% 8166MB/s ± 0% +1194.73% (p=0.000 n=10+10)
Equal/4M 626MB/s ± 0% 8673MB/s ± 0% +1284.94% (p=0.000 n=8+10)
Equal/64M 608MB/s ± 0% 8277MB/s ± 0% +1260.83% (p=0.000 n=9+9)
Change-Id: I1cd14ade16390a5097a8d4e9721d5e822fa6218f
Reviewed-on: https://go-review.googlesource.com/c/go/+/199597
Run-TryBot: Meng Zhuo <mzh@golangcn.org>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Trust: Meng Zhuo <mzh@golangcn.org>
BEQ R1, R2, eq
MOVV size+16(FP), R3
ADDV R1, R3, R4
-loop:
- BNE R1, R4, test
+
+ // chunk size is 16
+ SGTU $16, R3, R8
+ BEQ R0, R8, chunk_entry
+
+byte_loop:
+ BNE R1, R4, byte_test
MOVV $1, R1
MOVB R1, ret+24(FP)
RET
-test:
+byte_test:
MOVBU (R1), R6
ADDV $1, R1
MOVBU (R2), R7
ADDV $1, R2
- BEQ R6, R7, loop
+ BEQ R6, R7, byte_loop
+ JMP not_eq
+
+chunk_entry:
+ // make sure both a and b are aligned
+ OR R1, R2, R9
+ AND $0x7, R9
+ BNE R0, R9, byte_loop
+ JMP chunk_loop_1
+
+chunk_loop:
+ // chunk size is 16
+ SGTU $16, R3, R8
+ BNE R0, R8, chunk_tail_8
+chunk_loop_1:
+ MOVV (R1), R6
+ MOVV (R2), R7
+ BNE R6, R7, not_eq
+ MOVV 8(R1), R12
+ MOVV 8(R2), R13
+ ADDV $16, R1
+ ADDV $16, R2
+ SUBV $16, R3
+ BEQ R12, R13, chunk_loop
+ JMP not_eq
+
+chunk_tail_8:
+ AND $8, R3, R14
+ BEQ R0, R14, chunk_tail_4
+ MOVV (R1), R6
+ MOVV (R2), R7
+ BNE R6, R7, not_eq
+ ADDV $8, R1
+ ADDV $8, R2
+
+chunk_tail_4:
+ AND $4, R3, R14
+ BEQ R0, R14, chunk_tail_2
+ MOVWU (R1), R6
+ MOVWU (R2), R7
+ BNE R6, R7, not_eq
+ ADDV $4, R1
+ ADDV $4, R2
+
+chunk_tail_2:
+ AND $2, R3, R14
+ BEQ R0, R14, chunk_tail_1
+ MOVHU (R1), R6
+ MOVHU (R2), R7
+ BNE R6, R7, not_eq
+ ADDV $2, R1
+ ADDV $2, R2
+
+chunk_tail_1:
+ AND $1, R3, R14
+ BEQ R0, R14, eq
+ MOVBU (R1), R6
+ MOVBU (R2), R7
+ BEQ R6, R7, eq
+not_eq:
MOVB R0, ret+24(FP)
RET
eq: