#define REGCTXT R29
// memequal(a, b unsafe.Pointer, size uintptr) bool
-TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
- BEQ R4, R5, eq
- ADDV R4, R6, R7
- PCALIGN $16
-loop:
- BNE R4, R7, test
- MOVV $1, R4
- RET
-test:
- MOVBU (R4), R9
- ADDV $1, R4
- MOVBU (R5), R10
- ADDV $1, R5
- BEQ R9, R10, loop
-
- MOVB R0, R4
- RET
-eq:
- MOVV $1, R4
- RET
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
+ // R4 = a_base
+ // R5 = b_base
+ // R6 = size
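+	// tail call the shared comparison body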
+ JMP equalbody<>(SB)
// memequal_varlen(a, b unsafe.Pointer) bool
-TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$40-17
- BEQ R4, R5, eq
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0
+ // R4 = a_base
+ // R5 = b_base
MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure
- MOVV R4, 8(R3)
- MOVV R5, 16(R3)
- MOVV R6, 24(R3)
- JAL runtime·memequal(SB)
- MOVBU 32(R3), R4
- RET
-eq:
+ JMP equalbody<>(SB)
+
+// input:
+// R4 = a_base
+// R5 = b_base
+// R6 = size
+TEXT equalbody<>(SB),NOSPLIT|NOFRAME,$0
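+	// Dispatch on size: buffers of 64 bytes or more are compared with
+	// LASX (256-bit) or LSX (128-bit) vector loops when the CPU
+	// supports them, falling back to a scalar 64-byte loop; the tail
+	// handles the remaining bytes with progressively narrower loads.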
+	// fast path: identical pointers
+	BEQ	R4, R5, equal
+	// nothing to compare for size 0
+	BEQ	R6, equal
+
+ MOVV $64, R7
+ BGE R6, R7, lasx
+
+ // size < 64 bytes
+tail:
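+	// compare the remaining 1..63 bytes with progressively narrower
+	// loads: 16, 8, 4, 2, then 1 byte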
+ MOVV $16, R7
+ BLT R6, R7, lt_16
+generic16_loop:
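+	// 16 bytes per iteration via two 8-byte loads from each operand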
+ ADDV $-16, R6
+ MOVV 0(R4), R8
+ MOVV 8(R4), R9
+ MOVV 0(R5), R10
+ MOVV 8(R5), R11
+ BNE R8, R10, not_equal
+ BNE R9, R11, not_equal
+ BEQ R6, equal
+ ADDV $16, R4
+ ADDV $16, R5
+ BGE R6, R7, generic16_loop
+
+ // size < 16 bytes
+lt_16:
+ MOVV $8, R7
+ BLT R6, R7, lt_8
+ ADDV $-8, R6
+ MOVV 0(R4), R8
+ MOVV 0(R5), R9
+ BNE R8, R9, not_equal
+ BEQ R6, equal
+ ADDV $8, R4
+ ADDV $8, R5
+
+ // size < 8 bytes
+lt_8:
+ MOVV $4, R7
+ BLT R6, R7, lt_4
+ ADDV $-4, R6
+ MOVW 0(R4), R8
+ MOVW 0(R5), R9
+ BNE R8, R9, not_equal
+ BEQ R6, equal
+ ADDV $4, R4
+ ADDV $4, R5
+
+ // size < 4 bytes
+lt_4:
+ MOVV $2, R7
+ BLT R6, R7, lt_2
+ ADDV $-2, R6
+ MOVH 0(R4), R8
+ MOVH 0(R5), R9
+ BNE R8, R9, not_equal
+ BEQ R6, equal
+ ADDV $2, R4
+ ADDV $2, R5
+
+ // size < 2 bytes
+lt_2:
+ MOVB 0(R4), R8
+ MOVB 0(R5), R9
+ BNE R8, R9, not_equal
+
+equal:
MOVV $1, R4
RET
+
+not_equal:
+ MOVV R0, R4
+ RET
+
+	// Implemented using 256-bit LASX SIMD instructions
+lasx:
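+	// check the LASX feature bit set up by internal/cpu; fall back to
+	// LSX when 256-bit vectors are unavailable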
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
+ BEQ R7, lsx
+
+lasx256:
+ MOVV $256, R7
+ BLT R6, R7, lasx64
+lasx256_loop:
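+	// Compare 256 bytes per iteration. XVSEQV writes all-ones to each
+	// lane where a and b match and zero where they differ; ANDing the
+	// eight results leaves a zero lane iff any pair differed, and
+	// XVSETALLNEV sets FCC0 only when every lane is non-zero, so BFPF
+	// (branch if FCC0 is false) catches any mismatch.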
+ ADDV $-256, R6
+ XVMOVQ 0(R4), X0
+ XVMOVQ 32(R4), X1
+ XVMOVQ 64(R4), X2
+ XVMOVQ 96(R4), X3
+ XVMOVQ 128(R4), X4
+ XVMOVQ 160(R4), X5
+ XVMOVQ 192(R4), X6
+ XVMOVQ 224(R4), X7
+ XVMOVQ 0(R5), X8
+ XVMOVQ 32(R5), X9
+ XVMOVQ 64(R5), X10
+ XVMOVQ 96(R5), X11
+ XVMOVQ 128(R5), X12
+ XVMOVQ 160(R5), X13
+ XVMOVQ 192(R5), X14
+ XVMOVQ 224(R5), X15
+ XVSEQV X0, X8, X0
+ XVSEQV X1, X9, X1
+ XVSEQV X2, X10, X2
+ XVSEQV X3, X11, X3
+ XVSEQV X4, X12, X4
+ XVSEQV X5, X13, X5
+ XVSEQV X6, X14, X6
+ XVSEQV X7, X15, X7
+ XVANDV X0, X1, X0
+ XVANDV X2, X3, X2
+ XVANDV X4, X5, X4
+ XVANDV X6, X7, X6
+ XVANDV X0, X2, X0
+ XVANDV X4, X6, X4
+ XVANDV X0, X4, X0
+ XVSETALLNEV X0, FCC0
+ BFPF not_equal
+ BEQ R6, equal
+ ADDV $256, R4
+ ADDV $256, R5
+ BGE R6, R7, lasx256_loop
+
+lasx64:
+ MOVV $64, R7
+ BLT R6, R7, tail
+lasx64_loop:
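+	// same compare-and-reduce scheme on a single 64-byte block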
+ ADDV $-64, R6
+ XVMOVQ 0(R4), X0
+ XVMOVQ 32(R4), X1
+ XVMOVQ 0(R5), X2
+ XVMOVQ 32(R5), X3
+ XVSEQV X0, X2, X0
+ XVSEQV X1, X3, X1
+ XVANDV X0, X1, X0
+ XVSETALLNEV X0, FCC0
+ BFPF not_equal
+ BEQ R6, equal
+ ADDV $64, R4
+ ADDV $64, R5
+ BGE R6, R7, lasx64_loop
+ JMP tail
+
+	// Implemented using 128-bit LSX SIMD instructions
+lsx:
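+	// check the LSX feature bit; fall back to the scalar loop when
+	// vector instructions are unavailable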
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
+ BEQ R7, generic64_loop
+
+lsx128:
+ MOVV $128, R7
+ BLT R6, R7, lsx32
+lsx128_loop:
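+	// 128-bit LSX variant of the 256-byte LASX loop: eight 16-byte
+	// vectors from each operand cover 128 bytes per iteration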
+ ADDV $-128, R6
+ VMOVQ 0(R4), V0
+ VMOVQ 16(R4), V1
+ VMOVQ 32(R4), V2
+ VMOVQ 48(R4), V3
+ VMOVQ 64(R4), V4
+ VMOVQ 80(R4), V5
+ VMOVQ 96(R4), V6
+ VMOVQ 112(R4), V7
+ VMOVQ 0(R5), V8
+ VMOVQ 16(R5), V9
+ VMOVQ 32(R5), V10
+ VMOVQ 48(R5), V11
+ VMOVQ 64(R5), V12
+ VMOVQ 80(R5), V13
+ VMOVQ 96(R5), V14
+ VMOVQ 112(R5), V15
+ VSEQV V0, V8, V0
+ VSEQV V1, V9, V1
+ VSEQV V2, V10, V2
+ VSEQV V3, V11, V3
+ VSEQV V4, V12, V4
+ VSEQV V5, V13, V5
+ VSEQV V6, V14, V6
+ VSEQV V7, V15, V7
+ VANDV V0, V1, V0
+ VANDV V2, V3, V2
+ VANDV V4, V5, V4
+ VANDV V6, V7, V6
+ VANDV V0, V2, V0
+ VANDV V4, V6, V4
+ VANDV V0, V4, V0
+ VSETALLNEV V0, FCC0
+ BFPF not_equal
+ BEQ R6, equal
+ ADDV $128, R4
+ ADDV $128, R5
+ BGE R6, R7, lsx128_loop
+
+lsx32:
+ MOVV $32, R7
+ BLT R6, R7, tail
+lsx32_loop:
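+	// two 16-byte vectors from each operand per iteration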
+ ADDV $-32, R6
+ VMOVQ 0(R4), V0
+ VMOVQ 16(R4), V1
+ VMOVQ 0(R5), V2
+ VMOVQ 16(R5), V3
+ VSEQV V0, V2, V0
+ VSEQV V1, V3, V1
+ VANDV V0, V1, V0
+ VSETALLNEV V0, FCC0
+ BFPF not_equal
+ BEQ R6, equal
+ ADDV $32, R4
+ ADDV $32, R5
+ BGE R6, R7, lsx32_loop
+ JMP tail
+
+	// Implemented using general-purpose (scalar) instructions
+generic64_loop:
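+	// eight 8-byte loads from each operand per iteration; R22 is
+	// skipped because it is reserved for g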
+ ADDV $-64, R6
+ MOVV 0(R4), R7
+ MOVV 8(R4), R8
+ MOVV 16(R4), R9
+ MOVV 24(R4), R10
+ MOVV 0(R5), R15
+ MOVV 8(R5), R16
+ MOVV 16(R5), R17
+ MOVV 24(R5), R18
+ BNE R7, R15, not_equal
+ BNE R8, R16, not_equal
+ BNE R9, R17, not_equal
+ BNE R10, R18, not_equal
+ MOVV 32(R4), R11
+ MOVV 40(R4), R12
+ MOVV 48(R4), R13
+ MOVV 56(R4), R14
+ MOVV 32(R5), R19
+ MOVV 40(R5), R20
+ MOVV 48(R5), R21
+ MOVV 56(R5), R23
+ BNE R11, R19, not_equal
+ BNE R12, R20, not_equal
+ BNE R13, R21, not_equal
+ BNE R14, R23, not_equal
+ BEQ R6, equal
+ ADDV $64, R4
+ ADDV $64, R5
+ MOVV $64, R7
+ BGE R6, R7, generic64_loop
+ JMP tail