runtime: optimize memclrNoHeapPointers using SIMD on loong64
author Guoqi Chen <chenguoqi@loongson.cn>
Wed, 12 Mar 2025 06:35:51 +0000 (14:35 +0800)
committer abner chenc <chenguoqi@loongson.cn>
Thu, 27 Mar 2025 00:58:32 +0000 (17:58 -0700)
goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
                        |  bench.old   |            bench.new.256             |
                        |    sec/op    |    sec/op     vs base                |
Memclr/5                   3.204n ± 0%    2.804n ± 0%  -12.48% (p=0.000 n=10)
Memclr/16                  3.204n ± 0%    3.204n ± 0%        ~ (p=0.465 n=10)
Memclr/64                  5.267n ± 0%    4.005n ± 0%  -23.96% (p=0.000 n=10)
Memclr/256                10.280n ± 0%    5.400n ± 0%  -47.47% (p=0.000 n=10)
Memclr/4096               107.00n ± 1%    30.24n ± 0%  -71.74% (p=0.000 n=10)
Memclr/65536              1675.0n ± 0%    431.1n ± 0%  -74.26% (p=0.000 n=10)
Memclr/1M                  52.61µ ± 0%    32.82µ ± 0%  -37.62% (p=0.000 n=10)
Memclr/4M                  210.3µ ± 0%    131.3µ ± 0%  -37.59% (p=0.000 n=10)
Memclr/8M                  420.3µ ± 0%    262.5µ ± 0%  -37.54% (p=0.000 n=10)
Memclr/16M                 857.4µ ± 1%    542.9µ ± 3%  -36.68% (p=0.000 n=10)
Memclr/64M                 3.658m ± 3%    2.173m ± 0%  -40.59% (p=0.000 n=10)
MemclrUnaligned/0_5        4.264n ± 1%    4.359n ± 0%   +2.23% (p=0.000 n=10)
MemclrUnaligned/0_16       4.595n ± 0%    4.599n ± 0%   +0.10% (p=0.020 n=10)
MemclrUnaligned/0_64       5.356n ± 0%    5.122n ± 0%   -4.37% (p=0.000 n=10)
MemclrUnaligned/0_256     10.370n ± 0%    5.907n ± 1%  -43.03% (p=0.000 n=10)
MemclrUnaligned/0_4096    107.10n ± 0%    37.35n ± 0%  -65.13% (p=0.000 n=10)
MemclrUnaligned/0_65536   1694.0n ± 0%    441.7n ± 0%  -73.93% (p=0.000 n=10)
MemclrUnaligned/1_5        4.272n ± 0%    4.348n ± 0%   +1.76% (p=0.000 n=10)
MemclrUnaligned/1_16       4.593n ± 0%    4.608n ± 0%   +0.33% (p=0.002 n=10)
MemclrUnaligned/1_64       7.610n ± 0%    5.293n ± 0%  -30.45% (p=0.000 n=10)
MemclrUnaligned/1_256     12.230n ± 0%    9.012n ± 0%  -26.31% (p=0.000 n=10)
MemclrUnaligned/1_4096    114.10n ± 0%    39.50n ± 0%  -65.38% (p=0.000 n=10)
MemclrUnaligned/1_65536   1705.0n ± 0%    468.8n ± 0%  -72.50% (p=0.000 n=10)
MemclrUnaligned/4_5        4.283n ± 1%    4.346n ± 0%   +1.48% (p=0.000 n=10)
MemclrUnaligned/4_16       4.599n ± 0%    4.605n ± 0%   +0.12% (p=0.000 n=10)
MemclrUnaligned/4_64       7.572n ± 1%    5.283n ± 0%  -30.24% (p=0.000 n=10)
MemclrUnaligned/4_256     12.215n ± 0%    9.212n ± 0%  -24.58% (p=0.000 n=10)
MemclrUnaligned/4_4096    114.35n ± 0%    39.48n ± 0%  -65.47% (p=0.000 n=10)
MemclrUnaligned/4_65536   1705.0n ± 0%    469.2n ± 0%  -72.48% (p=0.000 n=10)
MemclrUnaligned/7_5        4.296n ± 1%    4.349n ± 0%   +1.22% (p=0.000 n=10)
MemclrUnaligned/7_16       4.601n ± 0%    4.606n ± 0%   +0.11% (p=0.004 n=10)
MemclrUnaligned/7_64       7.609n ± 0%    5.296n ± 1%  -30.39% (p=0.000 n=10)
MemclrUnaligned/7_256     12.200n ± 0%    9.011n ± 0%  -26.14% (p=0.000 n=10)
MemclrUnaligned/7_4096    114.00n ± 0%    39.51n ± 0%  -65.34% (p=0.000 n=10)
MemclrUnaligned/7_65536   1704.0n ± 0%    469.5n ± 0%  -72.45% (p=0.000 n=10)
MemclrUnaligned/0_1M       52.57µ ± 0%    32.83µ ± 0%  -37.54% (p=0.000 n=10)
MemclrUnaligned/0_4M       210.1µ ± 0%    131.3µ ± 0%  -37.53% (p=0.000 n=10)
MemclrUnaligned/0_8M       420.8µ ± 0%    262.5µ ± 0%  -37.62% (p=0.000 n=10)
MemclrUnaligned/0_16M      846.2µ ± 0%    528.4µ ± 0%  -37.56% (p=0.000 n=10)
MemclrUnaligned/0_64M      3.425m ± 1%    2.187m ± 3%  -36.16% (p=0.000 n=10)
MemclrUnaligned/1_1M       52.56µ ± 0%    32.84µ ± 0%  -37.52% (p=0.000 n=10)
MemclrUnaligned/1_4M       210.5µ ± 0%    131.3µ ± 0%  -37.62% (p=0.000 n=10)
MemclrUnaligned/1_8M       420.5µ ± 0%    262.7µ ± 0%  -37.53% (p=0.000 n=10)
MemclrUnaligned/1_16M      845.2µ ± 0%    528.3µ ± 0%  -37.49% (p=0.000 n=10)
MemclrUnaligned/1_64M      3.381m ± 0%    2.243m ± 3%  -33.66% (p=0.000 n=10)
MemclrUnaligned/4_1M       52.56µ ± 0%    32.85µ ± 0%  -37.50% (p=0.000 n=10)
MemclrUnaligned/4_4M       210.1µ ± 0%    131.3µ ± 0%  -37.49% (p=0.000 n=10)
MemclrUnaligned/4_8M       420.0µ ± 0%    262.6µ ± 0%  -37.48% (p=0.000 n=10)
MemclrUnaligned/4_16M      844.8µ ± 0%    528.7µ ± 0%  -37.41% (p=0.000 n=10)
MemclrUnaligned/4_64M      3.382m ± 1%    2.211m ± 4%  -34.63% (p=0.000 n=10)
MemclrUnaligned/7_1M       52.59µ ± 0%    32.84µ ± 0%  -37.56% (p=0.000 n=10)
MemclrUnaligned/7_4M       210.2µ ± 0%    131.3µ ± 0%  -37.54% (p=0.000 n=10)
MemclrUnaligned/7_8M       420.1µ ± 0%    262.7µ ± 0%  -37.47% (p=0.000 n=10)
MemclrUnaligned/7_16M      845.1µ ± 0%    528.7µ ± 0%  -37.43% (p=0.000 n=10)
MemclrUnaligned/7_64M      3.369m ± 0%    2.313m ± 1%  -31.34% (p=0.000 n=10)
MemclrRange/1K_2K         2707.0n ± 0%    972.4n ± 0%  -64.08% (p=0.000 n=10)
MemclrRange/2K_8K          8.816µ ± 0%    2.519µ ± 0%  -71.43% (p=0.000 n=10)
MemclrRange/4K_16K         8.333µ ± 0%    2.240µ ± 0%  -73.12% (p=0.000 n=10)
MemclrRange/160K_228K      83.47µ ± 0%    31.27µ ± 0%  -62.54% (p=0.000 n=10)
MemclrKnownSize1          0.4003n ± 0%   0.4004n ± 0%        ~ (p=0.119 n=10)
MemclrKnownSize2          0.4003n ± 0%   0.4005n ± 0%        ~ (p=0.069 n=10)
MemclrKnownSize4          0.4003n ± 0%   0.4005n ± 0%        ~ (p=0.100 n=10)
MemclrKnownSize8          0.4003n ± 0%   0.4004n ± 0%   +0.04% (p=0.047 n=10)
MemclrKnownSize16         0.8011n ± 0%   0.8012n ± 0%        ~ (p=0.926 n=10)
MemclrKnownSize32          1.602n ± 0%    1.602n ± 0%        ~ (p=0.772 n=10)
MemclrKnownSize64          2.405n ± 0%    2.404n ± 0%        ~ (p=0.780 n=10)
MemclrKnownSize112         2.804n ± 0%    2.804n ± 0%        ~ (p=0.538 n=10)
MemclrKnownSize128         3.204n ± 0%    3.205n ± 0%        ~ (p=0.105 n=10)
MemclrKnownSize192         4.808n ± 0%    4.807n ± 0%        ~ (p=0.688 n=10)
MemclrKnownSize248         6.347n ± 0%    6.346n ± 0%        ~ (p=0.133 n=10)
MemclrKnownSize256         6.560n ± 0%    6.573n ± 0%   +0.19% (p=0.001 n=10)
MemclrKnownSize512        13.010n ± 0%    6.809n ± 0%  -47.66% (p=0.000 n=10)
MemclrKnownSize1024       25.830n ± 0%    8.412n ± 0%  -67.43% (p=0.000 n=10)
MemclrKnownSize4096       102.70n ± 0%    27.64n ± 0%  -73.09% (p=0.000 n=10)
MemclrKnownSize512KiB      26.30µ ± 0%    16.42µ ± 0%  -37.59% (p=0.000 n=10)
geomean                    629.8n         393.2n       -37.57%
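
The table above is benchstat output over 10 runs (n=10) of the runtime Memclr
benchmarks on the old and new trees. As a minimal, self-contained sketch of
what these benchmarks measure (the real ones live in the runtime test files;
the names below are illustrative), clearing a pointer-free []byte compiles
down to a runtime.memclrNoHeapPointers call:

    // memclr_bench_test.go: illustrative stand-in for runtime's Memclr benchmarks.
    package memclr_test

    import "testing"

    func benchMemclr(b *testing.B, n int) {
            buf := make([]byte, n)
            b.SetBytes(int64(n))
            for i := 0; i < b.N; i++ {
                    clear(buf) // lowered by the compiler to runtime.memclrNoHeapPointers
            }
    }

    func BenchmarkMemclr256(b *testing.B)  { benchMemclr(b, 256) }
    func BenchmarkMemclr4096(b *testing.B) { benchMemclr(b, 4096) }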

Change-Id: I2b9fe834c31d786d2e30cc02c65a6f9c455c4e8d
Reviewed-on: https://go-review.googlesource.com/c/go/+/657835
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
src/runtime/cpuflags.go
src/runtime/memclr_loong64.s

index e81e50f5dfcb483eaa3a95b4a1b734a0a3ac1930..06424642c71ba4abb641393e4473a14bbf7963c9 100644 (file)
@@ -20,7 +20,8 @@ const (
 
        offsetMIPS64XHasMSA = unsafe.Offsetof(cpu.MIPS64X.HasMSA)
 
-       offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
+       offsetLOONG64HasLSX  = unsafe.Offsetof(cpu.Loong64.HasLSX)
+       offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX)
 )
 
 var (
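
The new offset constant lets the assembly test for LASX with a single byte
load (MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB)). In plain Go
the dispatch is equivalent to the sketch below; hasLASX and hasLSX stand in
for cpu.Loong64.HasLASX and cpu.Loong64.HasLSX, which are internal to the
runtime and not importable from ordinary packages:

    // Feature dispatch performed by the new memclr prologue, rendered as Go.
    package main

    import "fmt"

    var hasLASX, hasLSX bool // stand-ins for internal/cpu's Loong64 feature flags

    func clearStrategy() string {
            switch {
            case hasLASX:
                    return "LASX: 256-bit stores, 32-byte alignment, 256-byte loop"
            case hasLSX:
                    return "LSX: 128-bit stores, 16-byte alignment, 128-byte loop"
            default:
                    return "generic: 64-bit MOVV stores, 8-byte alignment, 64-byte loop"
            }
    }

    func main() { fmt.Println(clearStrategy()) }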
index 346b210c8de703eb675d81f915ee710ac542674b..76d8fb56bfd89979d02c534735753c58ec9b678e 100644 (file)
 
 // Algorithm:
 //
-// 1. when count <= 64 bytes, memory alignment check is omitted.
-// The handling is divided into distinct cases based on the size
-// of count: clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7,
-// clr_8, clr_9through16, clr_17through32, and clr_33through64.
+// 1. if lasx is enabled:
+//        THRESHOLD = 256, ALIGNMENTS = 32, LOOPBLOCKS = 256,
+//    else if lsx is enabled:
+//        THRESHOLD = 128, ALIGNMENTS = 16, LOOPBLOCKS = 128,
+//    else
+//        THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64,
 //
-// 2. when count > 64 bytes, memory alignment check is performed.
-// Unaligned bytes are processed first (that is, 8-(ptr&7)), and
-// then a 64-byte loop is executed to zero out memory.
-// When the number of remaining bytes not cleared is n < 64 bytes,
-// a tail processing is performed, invoking the corresponding case
-// based on the size of n.
+// 2. when 'count <= THRESHOLD' bytes, memory alignment check is omitted.
+// The handling is divided into distinct cases based on the size of count:
+//   a. clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7, clr_8,
+//      clr_9through16, clr_17through32, clr_33through64,
+//   b. lsx_clr_17through32, lsx_clr_33through64, lsx_clr_65through128,
+//   c. lasx_clr_17through32, lasx_clr_33through64, lasx_clr_65through128,
+//      lasx_clr_129through256
+//
+// 3. when 'count > THRESHOLD' bytes, memory alignment check is performed. Unaligned
+// bytes are processed first (that is, ALIGNMENTS - (ptr & (ALIGNMENTS-1))), and then
+// a LOOPBLOCKS-byte loop is executed to zero out memory.
+// When the number of remaining bytes not cleared is n < LOOPBLOCKS bytes, a tail
+// processing is performed, invoking the corresponding case based on the size of n.
+//
+// example:
+//    THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64
 //
 //    ptr           newptr                           ptrend
 //     |               |<----count after correction---->|
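
As a worked illustration of step 3 (a sketch, not the committed code): with
ALIGNMENTS = 32 and ptr = 0x1005, the unconditional head store zeroes 32 bytes
at ptr, then ptr is rounded up to 0x1020 and count shrinks by
32 - (0x1005 & 31) = 27 bytes. In Go terms, using hypothetical names:

    // alignHead mirrors the head fixup in clr_large/lsx_clr_large/lasx_clr_large.
    // It must only be called when ptr is unaligned, matching the BEQ guard
    // that skips the fixup for already-aligned pointers.
    func alignHead(ptr, n, align uintptr) (newptr, newn uintptr) {
            pad := align - (ptr & (align - 1)) // bytes covered by the head store
            return ptr + pad, n - pad
    }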
@@ -40,7 +52,6 @@
 TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
        BEQ     R5, clr_0
        ADDV    R4, R5, R6
-
 tail:
        // <=64 bytes, clear directly, not check aligned
        SGTU    $2, R5, R7
@@ -57,25 +68,152 @@ tail:
        BNE     R7, clr_8
        SGTU    $17, R5, R7
        BNE     R7, clr_9through16
+
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
+       BNE     R7, lasx_tail
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
+       BNE     R7, lsx_tail
+
        SGTU    $33, R5, R7
        BNE     R7, clr_17through32
        SGTU    $65, R5, R7
        BNE     R7, clr_33through64
+       JMP     clr_large
 
-       // n > 64 bytes, check aligned
-       AND     $7, R4, R7
-       BEQ     R7, body
+lasx_tail:
+       // X0 = 0
+       XVXORV  X0, X0, X0
+
+       SGTU    $33, R5, R7
+       BNE     R7, lasx_clr_17through32
+       SGTU    $65, R5, R7
+       BNE     R7, lasx_clr_33through64
+       SGTU    $129, R5, R7
+       BNE     R7, lasx_clr_65through128
+       SGTU    $257, R5, R7
+       BNE     R7, lasx_clr_129through256
+       JMP     lasx_clr_large
+
+lsx_tail:
+       // V0 = 0
+       VXORV   V0, V0, V0
+
+       SGTU    $33, R5, R7
+       BNE     R7, lsx_clr_17through32
+       SGTU    $65, R5, R7
+       BNE     R7, lsx_clr_33through64
+       SGTU    $129, R5, R7
+       BNE     R7, lsx_clr_65through128
+       JMP     lsx_clr_large
+
+       // use 256-bit SIMD (LASX) instructions to implement memclr
+       // n > 256 bytes, check 32-byte alignment
+lasx_clr_large:
+       AND     $31, R4, R7
+       BEQ     R7, lasx_clr_256loop
+       XVMOVQ  X0, (R4)
+       SUBV    R7, R4
+       ADDV    R7, R5
+       SUBV    $32, R5 // newn = n - (32 - (ptr & 31))
+       ADDV    $32, R4 // newptr = ptr + (32 - (ptr & 31))
+       SGTU    $257, R5, R7
+       BNE     R7, lasx_clr_129through256
+lasx_clr_256loop:
+       SUBV    $256, R5
+       SGTU    $256, R5, R7
+       XVMOVQ  X0, 0(R4)
+       XVMOVQ  X0, 32(R4)
+       XVMOVQ  X0, 64(R4)
+       XVMOVQ  X0, 96(R4)
+       XVMOVQ  X0, 128(R4)
+       XVMOVQ  X0, 160(R4)
+       XVMOVQ  X0, 192(R4)
+       XVMOVQ  X0, 224(R4)
+       ADDV    $256, R4
+       BEQ     R7, lasx_clr_256loop
+
+       // remaining_length is 0
+       BEQ     R5, clr_0
+
+       // 128 < remaining_length < 256
+       SGTU    $129, R5, R7
+       BEQ     R7, lasx_clr_129through256
+
+       // 64 < remaining_length <= 128
+       SGTU    $65, R5, R7
+       BEQ     R7, lasx_clr_65through128
+
+       // 32 < remaining_length <= 64
+       SGTU    $33, R5, R7
+       BEQ     R7, lasx_clr_33through64
+
+       // 16 < remaining_length <= 32
+       SGTU    $17, R5, R7
+       BEQ     R7, lasx_clr_17through32
+
+       // 0 < remaining_length <= 16
+       JMP     tail
+
+       // use 128-bit SIMD (LSX) instructions to implement memclr
+       // n > 128 bytes, check 16-byte alignment
+lsx_clr_large:
+       AND     $15, R4, R7
+       BEQ     R7, lsx_clr_128loop
+       VMOVQ   V0, (R4)
+       SUBV    R7, R4
+       ADDV    R7, R5
+       SUBV    $16, R5 // newn = n - (16 - (ptr & 15))
+       ADDV    $16, R4 // newptr = ptr + (16 - (ptr & 15))
+       SGTU    $129, R5, R7
+       BNE     R7, lsx_clr_65through128
+lsx_clr_128loop:
+       SUBV    $128, R5
+       SGTU    $128, R5, R7
+       VMOVQ   V0, 0(R4)
+       VMOVQ   V0, 16(R4)
+       VMOVQ   V0, 32(R4)
+       VMOVQ   V0, 48(R4)
+       VMOVQ   V0, 64(R4)
+       VMOVQ   V0, 80(R4)
+       VMOVQ   V0, 96(R4)
+       VMOVQ   V0, 112(R4)
+       ADDV    $128, R4
+       BEQ     R7, lsx_clr_128loop
 
-head:
+       // remaining_length is 0
+       BEQ     R5, clr_0
+
+       // 64 < remaining_length <= 128
+       SGTU    $65, R5, R7
+       BEQ     R7, lsx_clr_65through128
+
+       // 32 < remaining_length <= 64
+       SGTU    $33, R5, R7
+       BEQ     R7, lsx_clr_33through64
+
+       // 16 < remaining_length <= 32
+       SGTU    $17, R5, R7
+       BEQ     R7, lsx_clr_17through32
+
+       // 0 < remaining_length <= 16
+       JMP     tail
+
+       // use general-purpose instructions to implement memclr
+       // n > 64 bytes, check 8-byte alignment
+clr_large:
+       AND     $7, R4, R7
+       BEQ     R7, clr_64loop
        MOVV    R0, (R4)
        SUBV    R7, R4
        ADDV    R7, R5
        ADDV    $8, R4  // newptr = ptr + (8 - (ptr & 7))
        SUBV    $8, R5  // newn = n - (8 - (ptr & 7))
-       SGTU    $65, R5, R7
-       BNE     R7, clr_33through64
-
-body:
+       MOVV    $64, R7
+       BLT     R5, R7, clr_33through64
+clr_64loop:
+       SUBV    $64, R5
+       SGTU    $64, R5, R7
        MOVV    R0, (R4)
        MOVV    R0, 8(R4)
        MOVV    R0, 16(R4)
@@ -84,11 +222,21 @@ body:
        MOVV    R0, 40(R4)
        MOVV    R0, 48(R4)
        MOVV    R0, 56(R4)
-       ADDV    $-64, R5
        ADDV    $64, R4
-       SGTU    $65, R5, R7
-       BEQ     R7, body
+       BEQ     R7, clr_64loop
+
+       // remaining_length is 0
        BEQ     R5, clr_0
+
+       // 32 < remaining_length < 64
+       SGTU    $33, R5, R7
+       BEQ     R7, clr_33through64
+
+       // 16 < remaining_length <= 32
+       SGTU    $17, R5, R7
+       BEQ     R7, clr_17through32
+
+       // 0 < remaining_length <= 16
        JMP     tail
 
 clr_0:
@@ -133,3 +281,49 @@ clr_33through64:
        MOVV    R0, -16(R6)
        MOVV    R0, -8(R6)
        RET
+
+lasx_clr_17through32:
+       VMOVQ   V0, 0(R4)
+       VMOVQ   V0, -16(R6)
+       RET
+lasx_clr_33through64:
+       XVMOVQ  X0, 0(R4)
+       XVMOVQ  X0, -32(R6)
+       RET
+lasx_clr_65through128:
+       XVMOVQ  X0, 0(R4)
+       XVMOVQ  X0, 32(R4)
+       XVMOVQ  X0, -64(R6)
+       XVMOVQ  X0, -32(R6)
+       RET
+lasx_clr_129through256:
+       XVMOVQ  X0, 0(R4)
+       XVMOVQ  X0, 32(R4)
+       XVMOVQ  X0, 64(R4)
+       XVMOVQ  X0, 96(R4)
+       XVMOVQ  X0, -128(R6)
+       XVMOVQ  X0, -96(R6)
+       XVMOVQ  X0, -64(R6)
+       XVMOVQ  X0, -32(R6)
+       RET
+
+lsx_clr_17through32:
+       VMOVQ   V0, 0(R4)
+       VMOVQ   V0, -16(R6)
+       RET
+lsx_clr_33through64:
+       VMOVQ   V0, 0(R4)
+       VMOVQ   V0, 16(R4)
+       VMOVQ   V0, -32(R6)
+       VMOVQ   V0, -16(R6)
+       RET
+lsx_clr_65through128:
+       VMOVQ   V0, 0(R4)
+       VMOVQ   V0, 16(R4)
+       VMOVQ   V0, 32(R4)
+       VMOVQ   V0, 48(R4)
+       VMOVQ   V0, -64(R6)
+       VMOVQ   V0, -48(R6)
+       VMOVQ   V0, -32(R6)
+       VMOVQ   V0, -16(R6)
+       RET
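
The tail cases above rely on overlapping stores: for 17 <= n <= 32, one
16-byte store at the start (0(R4)) and one ending exactly at ptr+count
(-16(R6), with R6 = R4 + count) cover the whole range with no loop and no
finer size dispatch. A minimal runnable sketch in Go, with illustrative names:

    package main

    import "fmt"

    // clr17through32 mirrors lsx_clr_17through32: two 16-byte stores,
    // the second overlapping the first whenever len(p) < 32.
    func clr17through32(p []byte) {
            var zero [16]byte
            copy(p[:16], zero[:])        // VMOVQ V0, 0(R4)
            copy(p[len(p)-16:], zero[:]) // VMOVQ V0, -16(R6)
    }

    func main() {
            buf := []byte("abcdefghijklmnopqrstuvwxyz") // 26 bytes, in [17, 32]
            clr17through32(buf)
            fmt.Println(buf) // prints all zeros
    }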