BEQZ X5, cmp_len
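+ // X10 and X12 point at the current positions in a and b, X5 holds the number of bytes left to compare.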
MOV $32, X6
- BLT X5, X6, loop4_check
+ BLT X5, X6, check8_unaligned
// Check alignment - if alignment differs we have to do one byte at a time.
AND $7, X10, X7
AND $7, X12, X8
- BNE X7, X8, loop4_check
- BEQZ X7, loop32_check
+ BNE X7, X8, check8_unaligned
+ BEQZ X7, compare32
// Check one byte at a time until we reach 8 byte alignment.
SUB X7, X5, X5
align:
ADD $-1, X7
MOVBU 0(X10), X8
MOVBU 0(X12), X9
BNE X8, X9, cmp
ADD $1, X10
ADD $1, X12
BNEZ X7, align
-loop32_check:
- MOV $32, X7
- BLT X5, X7, loop16_check
-loop32:
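+ // Compare 32 bytes at a time using aligned 8 byte loads while at least 32 bytes remain.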
+check32:
+ MOV $32, X6
+ BLT X5, X6, compare16
+compare32:
MOV 0(X10), X15
MOV 0(X12), X16
MOV 8(X10), X17
MOV 8(X12), X18
- BEQ X15, X16, loop32a
- JMP cmp8a
-loop32a:
- BEQ X17, X18, loop32b
- JMP cmp8b
-loop32b:
+ BNE X15, X16, cmp8a
+ BNE X17, X18, cmp8b
MOV 16(X10), X15
MOV 16(X12), X16
MOV 24(X10), X17
MOV 24(X12), X18
- BEQ X15, X16, loop32c
- JMP cmp8a
-loop32c:
- BEQ X17, X18, loop32d
- JMP cmp8b
-loop32d:
+ BNE X15, X16, cmp8a
+ BNE X17, X18, cmp8b
ADD $32, X10
ADD $32, X12
ADD $-32, X5
- BGE X5, X7, loop32
+ BGE X5, X6, compare32
BEQZ X5, cmp_len
-loop16_check:
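+ // Compare 16 bytes using aligned 8 byte loads if at least 16 bytes remain.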
+check16:
MOV $16, X6
- BLT X5, X6, loop4_check
-loop16:
+ BLT X5, X6, check8_unaligned
+compare16:
MOV 0(X10), X15
MOV 0(X12), X16
MOV 8(X10), X17
MOV 8(X12), X18
- BEQ X15, X16, loop16a
- JMP cmp8a
-loop16a:
- BEQ X17, X18, loop16b
- JMP cmp8b
-loop16b:
+ BNE X15, X16, cmp8a
+ BNE X17, X18, cmp8b
ADD $16, X10
ADD $16, X12
ADD $-16, X5
- BGE X5, X6, loop16
BEQZ X5, cmp_len
-loop4_check:
- MOV $4, X6
- BLT X5, X6, loop1
-loop4:
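+ // Compare 8 bytes at a time using individual byte loads, so no alignment is required.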
+check8_unaligned:
+ MOV $8, X6
+ BLT X5, X6, check4_unaligned
+compare8_unaligned:
MOVBU 0(X10), X8
+ MOVBU 1(X10), X15
+ MOVBU 2(X10), X17
+ MOVBU 3(X10), X19
+ MOVBU 4(X10), X21
+ MOVBU 5(X10), X23
+ MOVBU 6(X10), X25
+ MOVBU 7(X10), X29
MOVBU 0(X12), X9
+ MOVBU 1(X12), X16
+ MOVBU 2(X12), X18
+ MOVBU 3(X12), X20
+ MOVBU 4(X12), X22
+ MOVBU 5(X12), X24
+ MOVBU 6(X12), X28
+ MOVBU 7(X12), X30
+ BNE X8, X9, cmp1a
+ BNE X15, X16, cmp1b
+ BNE X17, X18, cmp1c
+ BNE X19, X20, cmp1d
+ BNE X21, X22, cmp1e
+ BNE X23, X24, cmp1f
+ BNE X25, X28, cmp1g
+ BNE X29, X30, cmp1h
+ ADD $8, X10
+ ADD $8, X12
+ ADD $-8, X5
+ BGE X5, X6, compare8_unaligned
+ BEQZ X5, cmp_len
+
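+ // Compare 4 bytes at a time using byte loads.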
+check4_unaligned:
+ MOV $4, X6
+ BLT X5, X6, compare1
+compare4_unaligned:
+ MOVBU 0(X10), X8
MOVBU 1(X10), X15
+ MOVBU 2(X10), X17
+ MOVBU 3(X10), X19
+ MOVBU 0(X12), X9
MOVBU 1(X12), X16
- BEQ X8, X9, loop4a
- SLTU X9, X8, X5
- SLTU X8, X9, X6
- JMP cmp_ret
-loop4a:
- BEQ X15, X16, loop4b
- SLTU X16, X15, X5
- SLTU X15, X16, X6
- JMP cmp_ret
-loop4b:
- MOVBU 2(X10), X21
- MOVBU 2(X12), X22
- MOVBU 3(X10), X23
- MOVBU 3(X12), X24
- BEQ X21, X22, loop4c
- SLTU X22, X21, X5
- SLTU X21, X22, X6
- JMP cmp_ret
-loop4c:
- BEQ X23, X24, loop4d
- SLTU X24, X23, X5
- SLTU X23, X24, X6
- JMP cmp_ret
-loop4d:
+ MOVBU 2(X12), X18
+ MOVBU 3(X12), X20
+ BNE X8, X9, cmp1a
+ BNE X15, X16, cmp1b
+ BNE X17, X18, cmp1c
+ BNE X19, X20, cmp1d
ADD $4, X10
ADD $4, X12
ADD $-4, X5
- BGE X5, X6, loop4
+ BGE X5, X6, compare4_unaligned
-loop1:
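+ // Compare any remaining bytes one at a time.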
+compare1:
BEQZ X5, cmp_len
MOVBU 0(X10), X8
MOVBU 0(X12), X9
BNE X8, X9, cmp
ADD $1, X10
ADD $1, X12
ADD $-1, X5
- JMP loop1
+ JMP compare1
// Compare 8 bytes of memory in X15/X16 that are known to differ.
cmp8a:
- MOV $0xff, X19
-cmp8a_loop:
- AND X15, X19, X8
- AND X16, X19, X9
- BNE X8, X9, cmp
- SLLI $8, X19
- JMP cmp8a_loop
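+ // Move the differing words into X17/X18 and fall through to cmp8b.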
+ MOV X15, X17
+ MOV X16, X18
// Compare 8 bytes of memory in X17/X18 that are known to differ.
cmp8b:
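+ // Locate the first differing byte by masking one byte at a time, starting from the least significant (first in memory) byte.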
MOV $0xff, X19
-cmp8b_loop:
+cmp8_loop:
AND X17, X19, X8
AND X18, X19, X9
BNE X8, X9, cmp
SLLI $8, X19
- JMP cmp8b_loop
+ JMP cmp8_loop
+
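+ // Each cmp1{a-h} compares a pair of bytes known to differ: X5 is set if a's
+ // byte is less than b's, X6 is set if b's byte is less than a's.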
+cmp1a:
+ SLTU X9, X8, X5
+ SLTU X8, X9, X6
+ JMP cmp_ret
+cmp1b:
+ SLTU X16, X15, X5
+ SLTU X15, X16, X6
+ JMP cmp_ret
+cmp1c:
+ SLTU X18, X17, X5
+ SLTU X17, X18, X6
+ JMP cmp_ret
+cmp1d:
+ SLTU X20, X19, X5
+ SLTU X19, X20, X6
+ JMP cmp_ret
+cmp1e:
+ SLTU X22, X21, X5
+ SLTU X21, X22, X6
+ JMP cmp_ret
+cmp1f:
+ SLTU X24, X23, X5
+ SLTU X23, X24, X6
+ JMP cmp_ret
+cmp1g:
+ SLTU X28, X25, X5
+ SLTU X25, X28, X6
+ JMP cmp_ret
+cmp1h:
+ SLTU X30, X29, X5
+ SLTU X29, X30, X6
+ JMP cmp_ret
cmp_len:
MOV X11, X8