runtime: optimize the function memmove on loong64

author Xiaolin Zhao <zhaoxiaolin@loongson.cn>

Tue, 14 May 2024 02:59:26 +0000 (10:59 +0800)

committer Gopher Robot <gobot@golang.org>

Wed, 11 Sep 2024 21:43:27 +0000 (21:43 +0000)
author Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Tue, 14 May 2024 02:59:26 +0000 (10:59 +0800)
committer Gopher Robot <gobot@golang.org>
Wed, 11 Sep 2024 21:43:27 +0000 (21:43 +0000)
diff --git a/src/runtime/memmove_loong64.s b/src/runtime/memmove_loong64.s

index a94cf999bc4af9120478f0e6e8d7e60d9a962157..8827ca0742a4e39a17e65fd8af8b424fc81f83ae 100644 (file)
--- a/src/runtime/memmove_loong64.s
+++ b/src/runtime/memmove_loong64.s
@@ -6,99 +6,266 @@
  
  // See memmove Go doc for important implementation constraints.
  
+// Register map
+//
+// to              R4
+// from            R5
+// n (aka count)   R6
+// to-end          R7
+// from-end        R8
+// data            R11-R18
+// tmp             R9, R10
+
+// Algorithm:
+//
+// The memory alignment check is only performed for copy sizes
+// greater than 64 bytes, to minimize overhead.
+//
+// When the copy size is <= 64 bytes, the code at label tail selects
+// the case that matches the copy size and copies directly.
+// Based on the common memory access instructions of loong64, the
+// currently implemented cases are:
+// move_0, move_1, move_2, move_3, move_4, move_5through7, move_8,
+// move_9through16, move_17through32, move_33through64
+//
+// When the copy size is > 64 bytes, destination-aligned copying is
+// used, with the copy split into 3 parts:
+// 1. Head: do the memory alignment
+// 2. Body: a 64-byte loop structure
+// 3. Tail: processing of the remaining part (<= 64 bytes)
+//
+// forward:
+//
+//    Dst           NewDst                           Dstend
+//     |               |<----count after correction---->|
+//     |<-------------count before correction---------->|
+//     |<--8-(Dst&7)-->|               |<---64 bytes--->|
+//     +------------------------------------------------+
+//     |   Head        |      Body     |      Tail      |
+//     +---------------+---------------+----------------+
+//    NewDst = Dst - (Dst & 7) + 8
+//    count = count - 8 + (Dst & 7)
+//    Src = Src - (Dst & 7) + 8
+//
+// backward:
+//
+//    Dst                             NewDstend          Dstend
+//     |<-----count after correction------>|                |
+//     |<------------count before correction--------------->|
+//     |<---64 bytes--->|                  |<---Dstend&7--->|
+//     +----------------------------------------------------+
+//     |   Tail         |      Body        |      Head      |
+//     +----------------+------------------+----------------+
+//    NewDstend = Dstend - (Dstend & 7)
+//    count = count - (Dstend & 7)
+//    Srcend = Srcend - (Dstend & 7)
+
  // func memmove(to, from unsafe.Pointer, n uintptr)
  TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
-       BNE     R6, check
-       RET
+       BEQ     R4, R5, move_0 // to == from: nothing to copy
+       BEQ     R6, move_0 // n == 0: nothing to copy
  
-check:
-       SGTU    R4, R5, R7
-       BNE     R7, backward
+       ADDV    R4, R6, R7 // R7 = to + n (to-end pointer)
+       ADDV    R5, R6, R8 // R8 = from + n (from-end pointer)
  
-       ADDV    R4, R6, R9 // end pointer
+tail:
+       // copy size <= 64 bytes: a move_* case copies directly, without checking alignment
  
-       // if the two pointers are not of same alignments, do byte copying
-       SUBVU   R5, R4, R7
-       AND     $7, R7
-       BNE     R7, out
+       // < 2 bytes, i.e. 1 byte (a count of 0 never reaches tail)
+       SGTU    $2, R6, R9
+       BNE     R9, move_1
  
-       // if less than 8 bytes, do byte copying
-       SGTU    $8, R6, R7
-       BNE     R7, out
+       // < 3 bytes, i.e. exactly 2 bytes
+       SGTU    $3, R6, R9
+       BNE     R9, move_2
  
-       // do one byte at a time until 8-aligned
-       AND     $7, R4, R8
-       BEQ     R8, words
-       MOVB    (R5), R7
-       ADDV    $1, R5
-       MOVB    R7, (R4)
-       ADDV    $1, R4
-       JMP     -6(PC)
-
-words:
-       // do 8 bytes at a time if there is room
-       ADDV    $-7, R9, R6 // R6 is end pointer-7
-
-       PCALIGN $16
-       SGTU    R6, R4, R8
-       BEQ     R8, out
-       MOVV    (R5), R7
-       ADDV    $8, R5
-       MOVV    R7, (R4)
-       ADDV    $8, R4
-       JMP     -6(PC)
-
-out:
-       BEQ     R4, R9, done
-       MOVB    (R5), R7
+       // < 4 bytes, i.e. exactly 3 bytes
+       SGTU    $4, R6, R9
+       BNE     R9, move_3
+
+       // < 5 bytes, i.e. exactly 4 bytes
+       SGTU    $5, R6, R9
+       BNE     R9, move_4
+
+       // >= 5 bytes and < 8 bytes
+       SGTU    $8, R6, R9
+       BNE     R9, move_5through7
+
+       // < 9 bytes, i.e. exactly 8 bytes
+       SGTU    $9, R6, R9
+       BNE     R9, move_8
+
+       // >= 9 bytes and < 17 bytes
+       SGTU    $17, R6, R9
+       BNE     R9, move_9through16
+
+       // >= 17 bytes and < 33 bytes
+       SGTU    $33, R6, R9
+       BNE     R9, move_17through32
+
+       // >= 33 bytes and < 65 bytes
+       SGTU    $65, R6, R9
+       BNE     R9, move_33through64
+
+       // if (dst > src) && (dst < src + count), the regions are regarded
+       // as overlapping: jump to backward
+       // else, jump to forward
+       BGEU    R5, R4, forward // src >= dst
+       ADDV    R5, R6, R10 // R10 = src + count
+       BLTU    R4, R10, backward // dst < src + count
+
+forward:
+       AND     $7, R4, R9      // dst & 7
+       BEQ     R9, body // dst is already 8-byte aligned
+head:
+       MOVV    $8, R10
+       SUBV    R9, R10         // head = 8 - (dst & 7)
+       MOVB    (R5), R11 // byte loop: one byte is copied per iteration
+       SUBV    $1, R10
         ADDV    $1, R5
-       MOVB    R7, (R4)
+       MOVB    R11, (R4)
         ADDV    $1, R4
-       JMP     -5(PC)
-done:
-       RET
+       BNE     R10, -5(PC) // back to the MOVB load while head bytes remain
+       ADDV    R9, R6
+       ADDV    $-8, R6         // newcount = count + (dst & 7) - 8
+       // if newcount < 65 bytes, move_33through64 is enough to copy the rest
+       SGTU    $65, R6, R9
+       BNE     R9, move_33through64
  
+body:
+       MOVV    (R5), R11 // load 64 bytes from src into R11-R18
+       MOVV    8(R5), R12
+       MOVV    16(R5), R13
+       MOVV    24(R5), R14
+       MOVV    32(R5), R15
+       MOVV    40(R5), R16
+       MOVV    48(R5), R17
+       MOVV    56(R5), R18
+       MOVV    R11, (R4) // store the 64 bytes to dst
+       MOVV    R12, 8(R4)
+       MOVV    R13, 16(R4)
+       MOVV    R14, 24(R4)
+       MOVV    R15, 32(R4)
+       MOVV    R16, 40(R4)
+       MOVV    R17, 48(R4)
+       MOVV    R18, 56(R4)
+       ADDV    $-64, R6 // count -= 64
+       ADDV    $64, R4 // dst += 64
+       ADDV    $64, R5 // src += 64
+       SGTU    $64, R6, R9 // R9 = (count < 64)
+       // if the remaining part >= 64 bytes, jmp to body
+       BEQ     R9, body
+       // if the remaining part == 0 bytes, use move_0 to return
+       BEQ     R6, move_0
+       // if the remaining part in (0, 63] bytes, jmp to tail
+       JMP     tail
+
+// The backward copy algorithm is the same as the forward copy,
+// except for the direction.
  backward:
-       ADDV    R6, R5 // from-end pointer
-       ADDV    R4, R6, R9 // to-end pointer
-
-       // if the two pointers are not of same alignments, do byte copying
-       SUBVU   R9, R5, R7
-       AND     $7, R7
-       BNE     R7, out1
-
-       // if less than 8 bytes, do byte copying
-       SGTU    $8, R6, R7
-       BNE     R7, out1
-
-       // do one byte at a time until 8-aligned
-       AND     $7, R9, R8
-       BEQ     R8, words1
-       ADDV    $-1, R5
-       MOVB    (R5), R7
-       ADDV    $-1, R9
-       MOVB    R7, (R9)
-       JMP     -6(PC)
-
-words1:
-       // do 8 bytes at a time if there is room
-       ADDV    $7, R4, R6 // R6 is start pointer+7
-
-       PCALIGN $16
-       SGTU    R9, R6, R8
-       BEQ     R8, out1
-       ADDV    $-8, R5
-       MOVV    (R5), R7
-       ADDV    $-8, R9
-       MOVV    R7, (R9)
-       JMP     -6(PC)
-
-out1:
-       BEQ     R4, R9, done1
-       ADDV    $-1, R5
-       MOVB    (R5), R7
-       ADDV    $-1, R9
-       MOVB    R7, (R9)
-       JMP     -5(PC)
-done1:
+       AND     $7, R7, R9       // dstend & 7
+       BEQ     R9, b_body // dstend is already 8-byte aligned
+b_head:
+       MOVV    -8(R8), R11 // load the last 8 bytes of src
+       SUBV    R9, R6          // newcount = count - (dstend & 7)
+       SUBV    R9, R8          // newsrcend = srcend - (dstend & 7)
+       MOVV    -8(R8), R12 // load the 8 bytes ending at newsrcend
+       MOVV    R11, -8(R7) // store the last 8 bytes of dst
+       SUBV    R9, R7          // newdstend = dstend - (dstend & 7)
+       MOVV    R12, -8(R7) // store the 8 bytes ending at newdstend
+       SUBV    $8, R6 // count and both end pointers step back over the 8 bytes just stored
+       SUBV    $8, R7
+       SUBV    $8, R8
+       SGTU    $65, R6, R9 // R9 = (newcount < 65)
+       BNE     R9, move_33through64
+
+b_body:
+       MOVV    -8(R8), R11 // load the 64 bytes below srcend into R11-R18
+       MOVV    -16(R8), R12
+       MOVV    -24(R8), R13
+       MOVV    -32(R8), R14
+       MOVV    -40(R8), R15
+       MOVV    -48(R8), R16
+       MOVV    -56(R8), R17
+       MOVV    -64(R8), R18
+       MOVV    R11, -8(R7) // store the 64 bytes below dstend
+       MOVV    R12, -16(R7)
+       MOVV    R13, -24(R7)
+       MOVV    R14, -32(R7)
+       MOVV    R15, -40(R7)
+       MOVV    R16, -48(R7)
+       MOVV    R17, -56(R7)
+       MOVV    R18, -64(R7)
+       ADDV    $-64, R6 // count -= 64
+       ADDV    $-64, R7 // dstend -= 64
+       ADDV    $-64, R8 // srcend -= 64
+       SGTU    $64, R6, R9 // R9 = (count < 64)
+       BEQ     R9, b_body // remaining part >= 64 bytes: loop
+       BEQ     R6, move_0 // remaining part == 0 bytes: return
+       JMP     tail // remaining part in (0, 63] bytes: finish in tail
+
+move_0:
+       RET
+
+move_1:
+       MOVB    (R5), R11
+       MOVB    R11, (R4)
+       RET
+move_2:
+       MOVH    (R5), R11
+       MOVH    R11, (R4)
+       RET
+move_3:
+       MOVH    (R5), R11 // first 2 bytes
+       MOVB    -1(R8), R12 // last byte
+       MOVH    R11, (R4)
+       MOVB    R12, -1(R7)
+       RET
+move_4:
+       MOVW    (R5), R11
+       MOVW    R11, (R4)
+       RET
+move_5through7:
+       MOVW    (R5), R11 // first 4 bytes
+       MOVW    -4(R8), R12 // last 4 bytes (overlaps the first 4)
+       MOVW    R11, (R4)
+       MOVW    R12, -4(R7)
+       RET
+move_8:
+       MOVV    (R5), R11
+       MOVV    R11, (R4)
+       RET
+move_9through16:
+       MOVV    (R5), R11 // first 8 bytes
+       MOVV    -8(R8), R12 // last 8 bytes (overlaps the first 8 when count < 16)
+       MOVV    R11, (R4)
+       MOVV    R12, -8(R7)
+       RET
+move_17through32:
+       MOVV    (R5), R11 // first 16 bytes
+       MOVV    8(R5), R12
+       MOVV    -16(R8), R13 // last 16 bytes (overlaps the first 16 when count < 32)
+       MOVV    -8(R8), R14
+       MOVV    R11, (R4) // all loads above are issued before any store
+       MOVV    R12, 8(R4)
+       MOVV    R13, -16(R7)
+       MOVV    R14, -8(R7)
+       RET
+move_33through64:
+       MOVV    (R5), R11 // first 32 bytes
+       MOVV    8(R5), R12
+       MOVV    16(R5), R13
+       MOVV    24(R5), R14
+       MOVV    -32(R8), R15 // last 32 bytes (overlaps the first 32 when count < 64)
+       MOVV    -24(R8), R16
+       MOVV    -16(R8), R17
+       MOVV    -8(R8), R18
+       MOVV    R11, (R4) // all loads above are issued before any store
+       MOVV    R12, 8(R4)
+       MOVV    R13, 16(R4)
+       MOVV    R14, 24(R4)
+       MOVV    R15, -32(R7)
+       MOVV    R16, -24(R7)
+       MOVV    R17, -16(R7)
+       MOVV    R18, -8(R7)
         RET
author	Xiaolin Zhao <zhaoxiaolin@loongson.cn>
	Tue, 14 May 2024 02:59:26 +0000 (10:59 +0800)
committer	Gopher Robot <gobot@golang.org>
	Wed, 11 Sep 2024 21:43:27 +0000 (21:43 +0000)