// Algorithm:
//
-// 1. when count <= 64 bytes, memory alignment check is omitted.
-// The handling is divided into distinct cases based on the size
-// of count: clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7,
-// clr_8, clr_9through16, clr_17through32, and clr_33through64.
+// 1. if lasx is enabled:
+// THRESHOLD = 256, ALIGNMENTS = 32, LOOPBLOCKS = 256,
+// else if lsx is enabled:
+// THRESHOLD = 128, ALIGNMENTS = 16, LOOPBLOCKS = 128,
+// else
+// THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64,
//
-// 2. when count > 64 bytes, memory alignment check is performed.
-// Unaligned bytes are processed first (that is, 8-(ptr&7)), and
-// then a 64-byte loop is executed to zero out memory.
-// When the number of remaining bytes not cleared is n < 64 bytes,
-// a tail processing is performed, invoking the corresponding case
-// based on the size of n.
+// 2. when 'count <= THRESHOLD' bytes, memory alignment check is omitted.
+// The handling is divided into distinct cases based on the size of count:
+// a. clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7, clr_8,
+// clr_9through16, clr_17through32, clr_33through64,
+// b. lsx_clr_17through32, lsx_clr_33through64, lsx_clr_65through128,
+// c. lasx_clr_17through32, lasx_clr_33through64, lasx_clr_65through128,
+// lasx_clr_129through256
+//
+// 3. when 'count > THRESHOLD' bytes, memory alignment check is performed. Unaligned
+// bytes are processed first (that is, ALIGNMENTS - (ptr & (ALIGNMENTS-1))), and then
+// a LOOPBLOCKS-byte loop is executed to zero out memory.
+// When the number of remaining uncleared bytes is n < LOOPBLOCKS, tail processing
+// is performed, invoking the corresponding case based on the size of n.
+//
+// example:
+// THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64
//
//    ptr           newptr                           ptrend
//     |               |<----count after correction---->|
//     |<-------------count before correction---------->|
//     |<--8-(ptr&7)-->|               |<---64 bytes--->|
//     +------------------------------------------------+
//     |     Head      |      Body     |      Tail      |
//     +---------------+---------------+----------------+
//    newptr = ptr + (8 - (ptr & 7))
//    count after correction = count - (8 - (ptr & 7))
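//
// For instance, with the example parameters above and ptr = 0x1003, count = 100
// (values chosen purely for illustration):
//    head   = 8 - (0x1003 & 7) = 5 bytes       (0x1003 .. 0x1007)
//    newptr = 0x1008, count after correction = 95
//    body   = one 64-byte loop iteration       (0x1008 .. 0x1047)
//    tail   = 31 remaining bytes, cleared by clr_17through32

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)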
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
BEQ R5, clr_0
ADDV R4, R5, R6
-
tail:
	// counts up to THRESHOLD bytes are cleared directly, without an alignment
	// check; larger counts fall through to the *_clr_large paths
	SGTU	$2, R5, R7
	BNE	R7, clr_1
	SGTU	$3, R5, R7
	BNE	R7, clr_2
	SGTU	$4, R5, R7
	BNE	R7, clr_3
	SGTU	$5, R5, R7
	BNE	R7, clr_4
	SGTU	$8, R5, R7
	BNE	R7, clr_5through7
	SGTU	$9, R5, R7
	BNE	R7, clr_8
	SGTU	$17, R5, R7
	BNE	R7, clr_9through16
+
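+	// count > 16: use LASX if available, else LSX, else fall through to the
+	// general-purpose size classes below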
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
+ BNE R7, lasx_tail
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
+ BNE R7, lsx_tail
+
SGTU $33, R5, R7
BNE R7, clr_17through32
SGTU $65, R5, R7
BNE R7, clr_33through64
+ JMP clr_large
- // n > 64 bytes, check aligned
- AND $7, R4, R7
- BEQ R7, body
+lasx_tail:
+ // X0 = 0
+ XVXORV X0, X0, X0
+
+ SGTU $33, R5, R7
+ BNE R7, lasx_clr_17through32
+ SGTU $65, R5, R7
+ BNE R7, lasx_clr_33through64
+ SGTU $129, R5, R7
+ BNE R7, lasx_clr_65through128
+ SGTU $257, R5, R7
+ BNE R7, lasx_clr_129through256
+ JMP lasx_clr_large
+
+lsx_tail:
+ // V0 = 0
+ VXORV V0, V0, V0
+
+ SGTU $33, R5, R7
+ BNE R7, lsx_clr_17through32
+ SGTU $65, R5, R7
+ BNE R7, lsx_clr_33through64
+ SGTU $129, R5, R7
+ BNE R7, lsx_clr_65through128
+ JMP lsx_clr_large
+
+	// use LASX (256-bit SIMD) instructions to implement memclr
+	// n > 256 bytes, check 32-byte alignment
+lasx_clr_large:
+ AND $31, R4, R7
+ BEQ R7, lasx_clr_256loop
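+	// ptr is not 32-byte aligned: clear the first 32 bytes with an unaligned
+	// store, then round ptr up to the next 32-byte boundary; the aligned loop
+	// below may overlap these bytes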
+ XVMOVQ X0, (R4)
+ SUBV R7, R4
+ ADDV R7, R5
+ SUBV $32, R5 // newn = n - (32 - (ptr & 31))
+ ADDV $32, R4 // newptr = ptr + (32 - (ptr & 31))
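+	// after the fix-up fewer than 257 bytes (but at least 226) may remain;
+	// if so, finish with the 129..256 case instead of entering the loop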
+ SGTU $257, R5, R7
+ BNE R7, lasx_clr_129through256
+lasx_clr_256loop:
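+	// each iteration clears 256 bytes; R7 becomes non-zero once fewer than
+	// 256 bytes remain after this iteration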
+ SUBV $256, R5
+ SGTU $256, R5, R7
+ XVMOVQ X0, 0(R4)
+ XVMOVQ X0, 32(R4)
+ XVMOVQ X0, 64(R4)
+ XVMOVQ X0, 96(R4)
+ XVMOVQ X0, 128(R4)
+ XVMOVQ X0, 160(R4)
+ XVMOVQ X0, 192(R4)
+ XVMOVQ X0, 224(R4)
+ ADDV $256, R4
+ BEQ R7, lasx_clr_256loop
+
+ // remaining_length is 0
+ BEQ R5, clr_0
+
+ // 128 < remaining_length < 256
+ SGTU $129, R5, R7
+ BEQ R7, lasx_clr_129through256
+
+ // 64 < remaining_length <= 128
+ SGTU $65, R5, R7
+ BEQ R7, lasx_clr_65through128
+
+ // 32 < remaining_length <= 64
+ SGTU $33, R5, R7
+ BEQ R7, lasx_clr_33through64
+
+ // 16 < remaining_length <= 32
+ SGTU $17, R5, R7
+ BEQ R7, lasx_clr_17through32
+
+ // 0 < remaining_length <= 16
+ JMP tail
+
+	// use LSX (128-bit SIMD) instructions to implement memclr
+	// n > 128 bytes, check 16-byte alignment
+lsx_clr_large:
+ AND $15, R4, R7
+ BEQ R7, lsx_clr_128loop
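+	// ptr is not 16-byte aligned: clear the first 16 bytes with an unaligned
+	// store, then round ptr up to the next 16-byte boundary; the aligned loop
+	// below may overlap these bytes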
+ VMOVQ V0, (R4)
+ SUBV R7, R4
+ ADDV R7, R5
+ SUBV $16, R5 // newn = n - (16 - (ptr & 15))
+ ADDV $16, R4 // newptr = ptr + (16 - (ptr & 15))
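+	// after the fix-up fewer than 129 bytes (but at least 114) may remain;
+	// if so, finish with the 65..128 case instead of entering the loop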
+ SGTU $129, R5, R7
+ BNE R7, lsx_clr_65through128
+lsx_clr_128loop:
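+	// each iteration clears 128 bytes; R7 becomes non-zero once fewer than
+	// 128 bytes remain after this iteration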
+ SUBV $128, R5
+ SGTU $128, R5, R7
+ VMOVQ V0, 0(R4)
+ VMOVQ V0, 16(R4)
+ VMOVQ V0, 32(R4)
+ VMOVQ V0, 48(R4)
+ VMOVQ V0, 64(R4)
+ VMOVQ V0, 80(R4)
+ VMOVQ V0, 96(R4)
+ VMOVQ V0, 112(R4)
+ ADDV $128, R4
+ BEQ R7, lsx_clr_128loop
-head:
+ // remaining_length is 0
+ BEQ R5, clr_0
+
+ // 64 < remaining_length <= 128
+ SGTU $65, R5, R7
+ BEQ R7, lsx_clr_65through128
+
+ // 32 < remaining_length <= 64
+ SGTU $33, R5, R7
+ BEQ R7, lsx_clr_33through64
+
+ // 16 < remaining_length <= 32
+ SGTU $17, R5, R7
+ BEQ R7, lsx_clr_17through32
+
+ // 0 < remaining_length <= 16
+ JMP tail
+
+	// use general-purpose instructions to implement memclr
+	// n > 64 bytes, check 8-byte alignment
+clr_large:
+ AND $7, R4, R7
+ BEQ R7, clr_64loop
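+	// ptr is not 8-byte aligned: clear the first 8 bytes with an unaligned
+	// store, then round ptr up to the next 8-byte boundary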
MOVV R0, (R4)
SUBV R7, R4
ADDV R7, R5
ADDV $8, R4 // newptr = ptr + (8 - (ptr & 7))
SUBV $8, R5 // newn = n - (8 - (ptr & 7))
- SGTU $65, R5, R7
- BNE R7, clr_33through64
-
-body:
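+	// after the fix-up fewer than 64 bytes (but at least 58) may remain;
+	// if so, finish with the 33..64 case instead of entering the loop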
+ MOVV $64, R7
+ BLT R5, R7, clr_33through64
+clr_64loop:
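+	// each iteration clears 64 bytes; R7 becomes non-zero once fewer than
+	// 64 bytes remain after this iteration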
+ SUBV $64, R5
+ SGTU $64, R5, R7
MOVV R0, (R4)
MOVV R0, 8(R4)
	MOVV	R0, 16(R4)
	MOVV	R0, 24(R4)
	MOVV	R0, 32(R4)
	MOVV	R0, 40(R4)
MOVV R0, 48(R4)
MOVV R0, 56(R4)
- ADDV $-64, R5
ADDV $64, R4
- SGTU $65, R5, R7
- BEQ R7, body
+ BEQ R7, clr_64loop
+
+ // remaining_length is 0
BEQ R5, clr_0
+
+ // 32 < remaining_length < 64
+ SGTU $33, R5, R7
+ BEQ R7, clr_33through64
+
+ // 16 < remaining_length <= 32
+ SGTU $17, R5, R7
+ BEQ R7, clr_17through32
+
+ // 0 < remaining_length <= 16
JMP tail
clr_0:
	RET
clr_33through64:
	MOVV	R0, (R4)
	MOVV	R0, 8(R4)
	MOVV	R0, 16(R4)
	MOVV	R0, 24(R4)
	MOVV	R0, -32(R6)
	MOVV	R0, -24(R6)
	MOVV	R0, -16(R6)
	MOVV	R0, -8(R6)
	RET
+
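+// The fixed-size cases below clear from both ends of the buffer: stores relative
+// to R4 cover the front and stores relative to R6 (ptr+count) cover the back, so
+// one case handles every length in its range, with overlapping stores in between.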
+lasx_clr_17through32:
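+	// V0 is the low 128 bits of X0, which was zeroed in lasx_tail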
+ VMOVQ V0, 0(R4)
+ VMOVQ V0, -16(R6)
+ RET
+lasx_clr_33through64:
+ XVMOVQ X0, 0(R4)
+ XVMOVQ X0, -32(R6)
+ RET
+lasx_clr_65through128:
+ XVMOVQ X0, 0(R4)
+ XVMOVQ X0, 32(R4)
+ XVMOVQ X0, -64(R6)
+ XVMOVQ X0, -32(R6)
+ RET
+lasx_clr_129through256:
+ XVMOVQ X0, 0(R4)
+ XVMOVQ X0, 32(R4)
+ XVMOVQ X0, 64(R4)
+ XVMOVQ X0, 96(R4)
+ XVMOVQ X0, -128(R6)
+ XVMOVQ X0, -96(R6)
+ XVMOVQ X0, -64(R6)
+ XVMOVQ X0, -32(R6)
+ RET
+
+lsx_clr_17through32:
+ VMOVQ V0, 0(R4)
+ VMOVQ V0, -16(R6)
+ RET
+lsx_clr_33through64:
+ VMOVQ V0, 0(R4)
+ VMOVQ V0, 16(R4)
+ VMOVQ V0, -32(R6)
+ VMOVQ V0, -16(R6)
+ RET
+lsx_clr_65through128:
+ VMOVQ V0, 0(R4)
+ VMOVQ V0, 16(R4)
+ VMOVQ V0, 32(R4)
+ VMOVQ V0, 48(R4)
+ VMOVQ V0, -64(R6)
+ VMOVQ V0, -48(R6)
+ VMOVQ V0, -32(R6)
+ VMOVQ V0, -16(R6)
+ RET