// R7 = b_len
JMP cmpbody<>(SB)
-// On entry:
-// R5 length of a
-// R7 length of b
-// R4 points to the start of a
-// R6 points to the start of b
+// input:
+// R4: points to the start of a
+// R5: length of a
+// R6: points to the start of b
+// R7: length of b
// for regabi the return value (-1/0/1) in R4
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
- BEQ R4, R6, cmp_len // same start of a and b, then compare lengths
+ BEQ R4, R6, cmp_len // same start of a and b, then compare lengths
SGTU R5, R7, R9
BNE R9, b_lt_a
MOVV R5, R14
JMP entry
+
b_lt_a:
- MOVV R7, R14 // R14 is min(R5, R7)
+	MOVV	R7, R14	// R14 = min(R5, R7)
+
entry:
- ADDV R4, R14, R12 // R4 start of a, R12 end of a
- BEQ R4, R12, cmp_len // minlength is 0
+	BEQ	R14, cmp_len	// min length is 0
+ MOVV $32, R15
+ BGE R14, R15, lasx
tail:
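+	// Fewer than 32 bytes remain: compare in 8-, 4-, 2- and 1-byte steps.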
- MOVV $2, R15
- BLT R14, R15, cmp1 // min < 2
- SLLV $1, R15
- BLT R14, R15, cmp2 // min < 4
- SLLV $1, R15
- BLT R14, R15, cmp4 // min < 8
- SLLV $1, R15
- BLT R14, R15, cmp8 // min < 16
- SLLV $1, R15
- BLT R14, R15, cmp16 // min < 32
-
-// When min >= 32 bytes, enter the cmp32_loop loop processing:
-// take out 4 8-bytes from a and b in turn for comparison.
-cmp32_loop:
- MOVV (R4), R8
- MOVV (R6), R9
- MOVV 8(R4), R10
- MOVV 8(R6), R11
- BNE R8, R9, cmp8a
- BNE R10, R11, cmp8b
- MOVV 16(R4), R8
- MOVV 16(R6), R9
- MOVV 24(R4), R10
- MOVV 24(R6), R11
- BNE R8, R9, cmp8a
- BNE R10, R11, cmp8b
- ADDV $32, R4
- ADDV $32, R6
- SUBV $32, R14
- BGE R14, R15, cmp32_loop
- BEQ R14, cmp_len
-
-check16:
- MOVV $16, R15
- BLT R14, R15, check8
-cmp16:
- MOVV (R4), R8
- MOVV (R6), R9
- MOVV 8(R4), R10
- MOVV 8(R6), R11
- BNE R8, R9, cmp8a
- BNE R10, R11, cmp8b
- ADDV $16, R4
- ADDV $16, R6
- SUBV $16, R14
- BEQ R14, cmp_len
-
-check8:
MOVV $8, R15
- BLT R14, R15, check4
+ BLT R14, R15, lt_8
+generic8_loop:
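+	// Compare 8 bytes per iteration with 64-bit loads; a mismatching
+	// pair of words falls through into cmp8.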
+ MOVV (R4), R10
+ MOVV (R6), R11
+ BEQ R10, R11, generic8_equal
+
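+// cmp8 locates the first differing byte of the mismatching words in
+// R10/R11 by isolating one byte at a time (AND for byte 0, BSTRPICKV
+// bit-field extracts of bits [8k+7:8k] for the rest).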
cmp8:
- MOVV (R4), R8
- MOVV (R6), R9
- BNE R8, R9, cmp8a
+ AND $0xff, R10, R16
+ AND $0xff, R11, R17
+ BNE R16, R17, cmp_byte
+
+ BSTRPICKV $15, R10, $8, R16
+ BSTRPICKV $15, R11, $8, R17
+ BNE R16, R17, cmp_byte
+
+ BSTRPICKV $23, R10, $16, R16
+ BSTRPICKV $23, R11, $16, R17
+ BNE R16, R17, cmp_byte
+
+ BSTRPICKV $31, R10, $24, R16
+ BSTRPICKV $31, R11, $24, R17
+ BNE R16, R17, cmp_byte
+
+ BSTRPICKV $39, R10, $32, R16
+ BSTRPICKV $39, R11, $32, R17
+ BNE R16, R17, cmp_byte
+
+ BSTRPICKV $47, R10, $40, R16
+ BSTRPICKV $47, R11, $40, R17
+ BNE R16, R17, cmp_byte
+
+ BSTRPICKV $55, R10, $48, R16
+ BSTRPICKV $55, R11, $48, R17
+ BNE R16, R17, cmp_byte
+
+ BSTRPICKV $63, R10, $56, R16
+ BSTRPICKV $63, R11, $56, R17
+ BNE R16, R17, cmp_byte
+
+generic8_equal:
+ ADDV $-8, R14
+ BEQ R14, cmp_len
ADDV $8, R4
ADDV $8, R6
- SUBV $8, R14
- BEQ R14, cmp_len
+ BGE R14, R15, generic8_loop
-check4:
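+// Fewer than 8 bytes remain: compare 4 bytes if possible.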
+lt_8:
MOVV $4, R15
- BLT R14, R15, check2
-cmp4:
- MOVW (R4), R8
- MOVW (R6), R9
- BNE R8, R9, cmp8a
+ BLT R14, R15, lt_4
+
+ MOVWU (R4), R10
+ MOVWU (R6), R11
+ BEQ R10, R11, lt_8_equal
+
+ AND $0xff, R10, R16
+ AND $0xff, R11, R17
+ BNE R16, R17, cmp_byte
+
+ BSTRPICKV $15, R10, $8, R16
+ BSTRPICKV $15, R11, $8, R17
+ BNE R16, R17, cmp_byte
+
+ BSTRPICKV $23, R10, $16, R16
+ BSTRPICKV $23, R11, $16, R17
+ BNE R16, R17, cmp_byte
+
+ BSTRPICKV $31, R10, $24, R16
+ BSTRPICKV $31, R11, $24, R17
+ BNE R16, R17, cmp_byte
+
+lt_8_equal:
+ ADDV $-4, R14
+ BEQ R14, cmp_len
ADDV $4, R4
ADDV $4, R6
- SUBV $4, R14
- BEQ R14, cmp_len
-check2:
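+// Fewer than 4 bytes remain: compare 2 bytes if possible.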
+lt_4:
MOVV $2, R15
- BLT R14, R15, cmp1
-cmp2:
- MOVH (R4), R8
- MOVH (R6), R9
- BNE R8, R9, cmp8a
+ BLT R14, R15, lt_2
+
+ MOVHU (R4), R10
+ MOVHU (R6), R11
+ BEQ R10, R11, lt_4_equal
+
+ AND $0xff, R10, R16
+ AND $0xff, R11, R17
+ BNE R16, R17, cmp_byte
+
+ BSTRPICKV $15, R10, $8, R16
+ BSTRPICKV $15, R11, $8, R17
+ BNE R16, R17, cmp_byte
+
+lt_4_equal:
+ ADDV $-2, R14
+ BEQ R14, cmp_len
ADDV $2, R4
ADDV $2, R6
- SUBV $2, R14
- BEQ R14, cmp_len
-cmp1:
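+// Exactly one byte remains.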
- BEQ R14, cmp_len
- MOVBU (R4), R8
- MOVBU (R6), R9
- BNE R8, R9, byte_cmp
+lt_2:
+ MOVBU (R4), R16
+ MOVBU (R6), R17
+ BNE R16, R17, cmp_byte
JMP cmp_len
- // Compare 8/4/2 bytes taken from R8/R9 that are known to differ.
-cmp8a:
- MOVV R8, R10
- MOVV R9, R11
-
- // Compare 8/4/2 bytes taken from R10/R11 that are known to differ.
-cmp8b:
- MOVV $0xff, R15
-
- // Take single bytes from R10/R11 in turn for cyclic comparison.
-cmp8_loop:
- AND R10, R15, R8
- AND R11, R15, R9
- BNE R8, R9, byte_cmp
- SLLV $8, R15
- JMP cmp8_loop
-
- // Compare 1 bytes taken from R8/R9 that are known to differ.
-byte_cmp:
- SGTU R8, R9, R4 // R4 = 1 if (R8 > R9)
+	// Compare one byte each from R16/R17; they are known to differ.
+cmp_byte:
+ SGTU R16, R17, R4 // R4 = 1 if (R16 > R17)
BNE R0, R4, ret
MOVV $-1, R4
- JMP ret
+ RET
cmp_len:
SGTU R5, R7, R8
ret:
RET
+
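+// At least 32 bytes: use 256-bit LASX vectors if available, otherwise
+// fall back to 128-bit LSX vectors or the generic 32-byte loop.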
+lasx:
+	MOVV	$64, R20	// CTOV result when a 64-bit lane is fully equal
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R9
+ BEQ R9, lsx
+
+ MOVV $128, R15
+ BLT R14, R15, lasx32_loop
+lasx128_loop:
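+	// Compare 128 bytes per iteration with four 32-byte vector loads;
+	// a mismatching block branches out to locate the differing byte.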
+ XVMOVQ (R4), X0
+ XVMOVQ (R6), X1
+ XVSEQB X0, X1, X0
+ XVSETANYEQB X0, FCC0
+ BFPT lasx_found_0
+
+ XVMOVQ 32(R4), X0
+ XVMOVQ 32(R6), X1
+ XVSEQB X0, X1, X0
+ XVSETANYEQB X0, FCC0
+ BFPT lasx_found_32
+
+ XVMOVQ 64(R4), X0
+ XVMOVQ 64(R6), X1
+ XVSEQB X0, X1, X0
+ XVSETANYEQB X0, FCC0
+ BFPT lasx_found_64
+
+ XVMOVQ 96(R4), X0
+ XVMOVQ 96(R6), X1
+ XVSEQB X0, X1, X0
+ XVSETANYEQB X0, FCC0
+ BFPT lasx_found_96
+
+ ADDV $-128, R14
+ BEQ R14, cmp_len
+ ADDV $128, R4
+ ADDV $128, R6
+ BGE R14, R15, lasx128_loop
+
+ MOVV $32, R15
+ BLT R14, R15, tail
+lasx32_loop:
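+	// Compare 32 bytes per iteration until fewer than 32 remain.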
+ XVMOVQ (R4), X0
+ XVMOVQ (R6), X1
+ XVSEQB X0, X1, X0
+ XVSETANYEQB X0, FCC0
+ BFPT lasx_found_0
+
+ ADDV $-32, R14
+ BEQ R14, cmp_len
+ ADDV $32, R4
+ ADDV $32, R6
+ BGE R14, R15, lasx32_loop
+ JMP tail
+
+lasx_found_0:
+ MOVV R0, R11
+ JMP lasx_find_byte
+
+lasx_found_32:
+ MOVV $32, R11
+ JMP lasx_find_byte
+
+lasx_found_64:
+ MOVV $64, R11
+ JMP lasx_find_byte
+
+lasx_found_96:
+ MOVV $96, R11
+
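+// Scan the four 64-bit lanes of the XVSEQB result (equal bytes are 0xff,
+// differing bytes 0x00): CTOV returns 64 for a fully equal lane, or 8*k
+// when byte k is the first mismatch. R11 accumulates the lane offset.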
+lasx_find_byte:
+ XVMOVQ X0.V[0], R10
+ CTOV R10, R10
+ BNE R10, R20, find_byte
+ ADDV $8, R11
+
+ XVMOVQ X0.V[1], R10
+ CTOV R10, R10
+ BNE R10, R20, find_byte
+ ADDV $8, R11
+
+ XVMOVQ X0.V[2], R10
+ CTOV R10, R10
+ BNE R10, R20, find_byte
+ ADDV $8, R11
+
+ XVMOVQ X0.V[3], R10
+ CTOV R10, R10
+ JMP find_byte
+
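+// No LASX: use 128-bit LSX vectors if available (64 bytes per iteration,
+// then 16), otherwise the generic 32-byte loop.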
+lsx:
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R9
+ BEQ R9, generic32_loop
+
+ MOVV $64, R15
+ BLT R14, R15, lsx16_loop
+lsx64_loop:
+ VMOVQ (R4), V0
+ VMOVQ (R6), V1
+ VSEQB V0, V1, V0
+ VSETANYEQB V0, FCC0
+ BFPT lsx_found_0
+
+ VMOVQ 16(R4), V0
+ VMOVQ 16(R6), V1
+ VSEQB V0, V1, V0
+ VSETANYEQB V0, FCC0
+ BFPT lsx_found_16
+
+ VMOVQ 32(R4), V0
+ VMOVQ 32(R6), V1
+ VSEQB V0, V1, V0
+ VSETANYEQB V0, FCC0
+ BFPT lsx_found_32
+
+ VMOVQ 48(R4), V0
+ VMOVQ 48(R6), V1
+ VSEQB V0, V1, V0
+ VSETANYEQB V0, FCC0
+ BFPT lsx_found_48
+
+ ADDV $-64, R14
+ BEQ R14, cmp_len
+ ADDV $64, R4
+ ADDV $64, R6
+ BGE R14, R15, lsx64_loop
+
+ MOVV $16, R15
+ BLT R14, R15, tail
+lsx16_loop:
+ VMOVQ (R4), V0
+ VMOVQ (R6), V1
+ VSEQB V0, V1, V0
+ VSETANYEQB V0, FCC0
+ BFPT lsx_found_0
+
+ ADDV $-16, R14
+ BEQ R14, cmp_len
+ ADDV $16, R4
+ ADDV $16, R6
+ BGE R14, R15, lsx16_loop
+ JMP tail
+
+lsx_found_0:
+ MOVV R0, R11
+ JMP lsx_find_byte
+
+lsx_found_16:
+ MOVV $16, R11
+ JMP lsx_find_byte
+
+lsx_found_32:
+ MOVV $32, R11
+ JMP lsx_find_byte
+
+lsx_found_48:
+ MOVV $48, R11
+
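+// Same as lasx_find_byte, but over the two 64-bit lanes of the LSX result.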
+lsx_find_byte:
+ VMOVQ V0.V[0], R10
+ CTOV R10, R10
+ BNE R10, R20, find_byte
+ ADDV $8, R11
+
+ VMOVQ V0.V[1], R10
+ CTOV R10, R10
+
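+// R10 holds the CTOV bit count within the lane and R11 the byte offset of
+// the lane itself: convert bits to bytes, then load and compare the first
+// differing pair of bytes.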
+find_byte:
+ SRLV $3, R10
+ ADDV R10, R11
+ ADDV R11, R4
+ ADDV R11, R6
+ MOVB (R4), R16
+ MOVB (R6), R17
+ JMP cmp_byte
+
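+// Neither LASX nor LSX: compare 32 bytes per iteration with four 64-bit
+// loads; a mismatching pair of words is resolved in cmp8.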
+generic32_loop:
+ MOVV (R4), R10
+ MOVV (R6), R11
+ BNE R10, R11, cmp8
+ MOVV 8(R4), R10
+ MOVV 8(R6), R11
+ BNE R10, R11, cmp8
+ MOVV 16(R4), R10
+ MOVV 16(R6), R11
+ BNE R10, R11, cmp8
+ MOVV 24(R4), R10
+ MOVV 24(R6), R11
+ BNE R10, R11, cmp8
+ ADDV $-32, R14
+ BEQ R14, cmp_len
+ ADDV $32, R4
+ ADDV $32, R6
+ MOVV $32, R15
+ BGE R14, R15, generic32_loop
+ JMP tail