runtime: optimize memclrNoHeapPointers using SIMD on loong64
author Guoqi Chen <chenguoqi@loongson.cn>
Wed, 12 Mar 2025 06:35:51 +0000 (14:35 +0800)
committer abner chenc <chenguoqi@loongson.cn>
Thu, 27 Mar 2025 00:58:32 +0000 (17:58 -0700)
goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
                        |  bench.old   |            bench.new.256             |
                        |    sec/op    |    sec/op     vs base                |
Memclr/5                   3.204n ± 0%    2.804n ± 0%  -12.48% (p=0.000 n=10)
Memclr/16                  3.204n ± 0%    3.204n ± 0%        ~ (p=0.465 n=10)
Memclr/64                  5.267n ± 0%    4.005n ± 0%  -23.96% (p=0.000 n=10)
Memclr/256                10.280n ± 0%    5.400n ± 0%  -47.47% (p=0.000 n=10)
Memclr/4096               107.00n ± 1%    30.24n ± 0%  -71.74% (p=0.000 n=10)
Memclr/65536              1675.0n ± 0%    431.1n ± 0%  -74.26% (p=0.000 n=10)
Memclr/1M                  52.61µ ± 0%    32.82µ ± 0%  -37.62% (p=0.000 n=10)
Memclr/4M                  210.3µ ± 0%    131.3µ ± 0%  -37.59% (p=0.000 n=10)
Memclr/8M                  420.3µ ± 0%    262.5µ ± 0%  -37.54% (p=0.000 n=10)
Memclr/16M                 857.4µ ± 1%    542.9µ ± 3%  -36.68% (p=0.000 n=10)
Memclr/64M                 3.658m ± 3%    2.173m ± 0%  -40.59% (p=0.000 n=10)
MemclrUnaligned/0_5        4.264n ± 1%    4.359n ± 0%   +2.23% (p=0.000 n=10)
MemclrUnaligned/0_16       4.595n ± 0%    4.599n ± 0%   +0.10% (p=0.020 n=10)
MemclrUnaligned/0_64       5.356n ± 0%    5.122n ± 0%   -4.37% (p=0.000 n=10)
MemclrUnaligned/0_256     10.370n ± 0%    5.907n ± 1%  -43.03% (p=0.000 n=10)
MemclrUnaligned/0_4096    107.10n ± 0%    37.35n ± 0%  -65.13% (p=0.000 n=10)
MemclrUnaligned/0_65536   1694.0n ± 0%    441.7n ± 0%  -73.93% (p=0.000 n=10)
MemclrUnaligned/1_5        4.272n ± 0%    4.348n ± 0%   +1.76% (p=0.000 n=10)
MemclrUnaligned/1_16       4.593n ± 0%    4.608n ± 0%   +0.33% (p=0.002 n=10)
MemclrUnaligned/1_64       7.610n ± 0%    5.293n ± 0%  -30.45% (p=0.000 n=10)
MemclrUnaligned/1_256     12.230n ± 0%    9.012n ± 0%  -26.31% (p=0.000 n=10)
MemclrUnaligned/1_4096    114.10n ± 0%    39.50n ± 0%  -65.38% (p=0.000 n=10)
MemclrUnaligned/1_65536   1705.0n ± 0%    468.8n ± 0%  -72.50% (p=0.000 n=10)
MemclrUnaligned/4_5        4.283n ± 1%    4.346n ± 0%   +1.48% (p=0.000 n=10)
MemclrUnaligned/4_16       4.599n ± 0%    4.605n ± 0%   +0.12% (p=0.000 n=10)
MemclrUnaligned/4_64       7.572n ± 1%    5.283n ± 0%  -30.24% (p=0.000 n=10)
MemclrUnaligned/4_256     12.215n ± 0%    9.212n ± 0%  -24.58% (p=0.000 n=10)
MemclrUnaligned/4_4096    114.35n ± 0%    39.48n ± 0%  -65.47% (p=0.000 n=10)
MemclrUnaligned/4_65536   1705.0n ± 0%    469.2n ± 0%  -72.48% (p=0.000 n=10)
MemclrUnaligned/7_5        4.296n ± 1%    4.349n ± 0%   +1.22% (p=0.000 n=10)
MemclrUnaligned/7_16       4.601n ± 0%    4.606n ± 0%   +0.11% (p=0.004 n=10)
MemclrUnaligned/7_64       7.609n ± 0%    5.296n ± 1%  -30.39% (p=0.000 n=10)
MemclrUnaligned/7_256     12.200n ± 0%    9.011n ± 0%  -26.14% (p=0.000 n=10)
MemclrUnaligned/7_4096    114.00n ± 0%    39.51n ± 0%  -65.34% (p=0.000 n=10)
MemclrUnaligned/7_65536   1704.0n ± 0%    469.5n ± 0%  -72.45% (p=0.000 n=10)
MemclrUnaligned/0_1M       52.57µ ± 0%    32.83µ ± 0%  -37.54% (p=0.000 n=10)
MemclrUnaligned/0_4M       210.1µ ± 0%    131.3µ ± 0%  -37.53% (p=0.000 n=10)
MemclrUnaligned/0_8M       420.8µ ± 0%    262.5µ ± 0%  -37.62% (p=0.000 n=10)
MemclrUnaligned/0_16M      846.2µ ± 0%    528.4µ ± 0%  -37.56% (p=0.000 n=10)
MemclrUnaligned/0_64M      3.425m ± 1%    2.187m ± 3%  -36.16% (p=0.000 n=10)
MemclrUnaligned/1_1M       52.56µ ± 0%    32.84µ ± 0%  -37.52% (p=0.000 n=10)
MemclrUnaligned/1_4M       210.5µ ± 0%    131.3µ ± 0%  -37.62% (p=0.000 n=10)
MemclrUnaligned/1_8M       420.5µ ± 0%    262.7µ ± 0%  -37.53% (p=0.000 n=10)
MemclrUnaligned/1_16M      845.2µ ± 0%    528.3µ ± 0%  -37.49% (p=0.000 n=10)
MemclrUnaligned/1_64M      3.381m ± 0%    2.243m ± 3%  -33.66% (p=0.000 n=10)
MemclrUnaligned/4_1M       52.56µ ± 0%    32.85µ ± 0%  -37.50% (p=0.000 n=10)
MemclrUnaligned/4_4M       210.1µ ± 0%    131.3µ ± 0%  -37.49% (p=0.000 n=10)
MemclrUnaligned/4_8M       420.0µ ± 0%    262.6µ ± 0%  -37.48% (p=0.000 n=10)
MemclrUnaligned/4_16M      844.8µ ± 0%    528.7µ ± 0%  -37.41% (p=0.000 n=10)
MemclrUnaligned/4_64M      3.382m ± 1%    2.211m ± 4%  -34.63% (p=0.000 n=10)
MemclrUnaligned/7_1M       52.59µ ± 0%    32.84µ ± 0%  -37.56% (p=0.000 n=10)
MemclrUnaligned/7_4M       210.2µ ± 0%    131.3µ ± 0%  -37.54% (p=0.000 n=10)
MemclrUnaligned/7_8M       420.1µ ± 0%    262.7µ ± 0%  -37.47% (p=0.000 n=10)
MemclrUnaligned/7_16M      845.1µ ± 0%    528.7µ ± 0%  -37.43% (p=0.000 n=10)
MemclrUnaligned/7_64M      3.369m ± 0%    2.313m ± 1%  -31.34% (p=0.000 n=10)
MemclrRange/1K_2K         2707.0n ± 0%    972.4n ± 0%  -64.08% (p=0.000 n=10)
MemclrRange/2K_8K          8.816µ ± 0%    2.519µ ± 0%  -71.43% (p=0.000 n=10)
MemclrRange/4K_16K         8.333µ ± 0%    2.240µ ± 0%  -73.12% (p=0.000 n=10)
MemclrRange/160K_228K      83.47µ ± 0%    31.27µ ± 0%  -62.54% (p=0.000 n=10)
MemclrKnownSize1          0.4003n ± 0%   0.4004n ± 0%        ~ (p=0.119 n=10)
MemclrKnownSize2          0.4003n ± 0%   0.4005n ± 0%        ~ (p=0.069 n=10)
MemclrKnownSize4          0.4003n ± 0%   0.4005n ± 0%        ~ (p=0.100 n=10)
MemclrKnownSize8          0.4003n ± 0%   0.4004n ± 0%   +0.04% (p=0.047 n=10)
MemclrKnownSize16         0.8011n ± 0%   0.8012n ± 0%        ~ (p=0.926 n=10)
MemclrKnownSize32          1.602n ± 0%    1.602n ± 0%        ~ (p=0.772 n=10)
MemclrKnownSize64          2.405n ± 0%    2.404n ± 0%        ~ (p=0.780 n=10)
MemclrKnownSize112         2.804n ± 0%    2.804n ± 0%        ~ (p=0.538 n=10)
MemclrKnownSize128         3.204n ± 0%    3.205n ± 0%        ~ (p=0.105 n=10)
MemclrKnownSize192         4.808n ± 0%    4.807n ± 0%        ~ (p=0.688 n=10)
MemclrKnownSize248         6.347n ± 0%    6.346n ± 0%        ~ (p=0.133 n=10)
MemclrKnownSize256         6.560n ± 0%    6.573n ± 0%   +0.19% (p=0.001 n=10)
MemclrKnownSize512        13.010n ± 0%    6.809n ± 0%  -47.66% (p=0.000 n=10)
MemclrKnownSize1024       25.830n ± 0%    8.412n ± 0%  -67.43% (p=0.000 n=10)
MemclrKnownSize4096       102.70n ± 0%    27.64n ± 0%  -73.09% (p=0.000 n=10)
MemclrKnownSize512KiB      26.30µ ± 0%    16.42µ ± 0%  -37.59% (p=0.000 n=10)
geomean                    629.8n         393.2n       -37.57%
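
The table above is benchstat output over 10 runs (n=10) of the runtime Memclr
benchmarks on the old and new trees. As a minimal, self-contained sketch of
what these benchmarks measure (the real ones live in the runtime test files;
the names below are illustrative), clearing a pointer-free []byte compiles
down to a runtime.memclrNoHeapPointers call:

    // memclr_bench_test.go: illustrative stand-in for runtime's Memclr benchmarks.
    package memclr_test

    import "testing"

    func benchMemclr(b *testing.B, n int) {
            buf := make([]byte, n)
            b.SetBytes(int64(n))
            for i := 0; i < b.N; i++ {
                    clear(buf) // lowered by the compiler to runtime.memclrNoHeapPointers
            }
    }

    func BenchmarkMemclr256(b *testing.B)  { benchMemclr(b, 256) }
    func BenchmarkMemclr4096(b *testing.B) { benchMemclr(b, 4096) }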

Change-Id: I2b9fe834c31d786d2e30cc02c65a6f9c455c4e8d
Reviewed-on: https://go-review.googlesource.com/c/go/+/657835
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
src/runtime/cpuflags.go
src/runtime/memclr_loong64.s

index e81e50f5dfcb483eaa3a95b4a1b734a0a3ac1930..06424642c71ba4abb641393e4473a14bbf7963c9 100644 (file)
@@ -20,7 +20,8 @@ const (
 
        offsetMIPS64XHasMSA = unsafe.Offsetof(cpu.MIPS64X.HasMSA)
 
-       offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
+       offsetLOONG64HasLSX  = unsafe.Offsetof(cpu.Loong64.HasLSX)
+       offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX)
 )
 
 var (
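
The new offset constant lets the assembly test for LASX with a single byte
load (MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB)). In plain Go
the dispatch is equivalent to the sketch below; hasLASX and hasLSX stand in
for cpu.Loong64.HasLASX and cpu.Loong64.HasLSX, which are internal to the
runtime and not importable from ordinary packages:

    // Feature dispatch performed by the new memclr prologue, rendered as Go.
    package main

    import "fmt"

    var hasLASX, hasLSX bool // stand-ins for internal/cpu's Loong64 feature flags

    func clearStrategy() string {
            switch {
            case hasLASX:
                    return "LASX: 256-bit stores, 32-byte alignment, 256-byte loop"
            case hasLSX:
                    return "LSX: 128-bit stores, 16-byte alignment, 128-byte loop"
            default:
                    return "generic: 64-bit MOVV stores, 8-byte alignment, 64-byte loop"
            }
    }

    func main() { fmt.Println(clearStrategy()) }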
index 346b210c8de703eb675d81f915ee710ac542674b..76d8fb56bfd89979d02c534735753c58ec9b678e 100644 (file)
 
 // Algorithm:
 //
-// 1. when count <= 64 bytes, memory alignment check is omitted.
-// The handling is divided into distinct cases based on the size
-// of count: clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7,
-// clr_8, clr_9through16, clr_17through32, and clr_33through64.
+// 1. if lasx is enabled:
+//        THRESHOLD = 256, ALIGNMENTS = 32, LOOPBLOCKS = 256,
+//    else if lsx is enabled:
+//        THRESHOLD = 128, ALIGNMENTS = 16, LOOPBLOCKS = 128,
+//    else
+//        THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64,
 //
-// 2. when count > 64 bytes, memory alignment check is performed.
-// Unaligned bytes are processed first (that is, 8-(ptr&7)), and
-// then a 64-byte loop is executed to zero out memory.
-// When the number of remaining bytes not cleared is n < 64 bytes,
-// a tail processing is performed, invoking the corresponding case
-// based on the size of n.
+// 2. when 'count <= THRESHOLD' bytes, memory alignment check is omitted.
+// The handling is divided into distinct cases based on the size of count:
+//   a. clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7, clr_8,
+//      clr_9through16, clr_17through32, clr_33through64,
+//   b. lsx_clr_17through32, lsx_clr_33through64, lsx_clr_65through128,
+//   c. lasx_clr_17through32, lasx_clr_33through64, lasx_clr_65through128,
+//      lasx_clr_129through256
+//
+// 3. when 'count > THRESHOLD' bytes, memory alignment check is performed. Unaligned
+// bytes are processed first (that is, ALIGNMENTS - (ptr & (ALIGNMENTS-1))), and then
+// a LOOPBLOCKS-byte loop is executed to zero out memory.
+// When the number of remaining bytes not cleared is n < LOOPBLOCKS bytes, a tail
+// processing is performed, invoking the corresponding case based on the size of n.
+//
+// example:
+//    THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64
 //
 //    ptr           newptr                           ptrend
 //     |               |<----count after correction---->|
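
As a worked illustration of step 3 (a sketch, not the committed code): with
ALIGNMENTS = 32 and ptr = 0x1005, the unconditional head store zeroes 32 bytes
at ptr, then ptr is rounded up to 0x1020 and count shrinks by
32 - (0x1005 & 31) = 27 bytes. In Go terms, using hypothetical names:

    // alignHead mirrors the head fixup in clr_large/lsx_clr_large/lasx_clr_large.
    // It must only be called when ptr is unaligned, matching the BEQ guard
    // that skips the fixup for already-aligned pointers.
    func alignHead(ptr, n, align uintptr) (newptr, newn uintptr) {
            pad := align - (ptr & (align - 1)) // bytes covered by the head store
            return ptr + pad, n - pad
    }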
@@ -40,7 +52,6 @@
 TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
        BEQ     R5, clr_0
        ADDV    R4, R5, R6
-
 tail:
        // <=64 bytes, clear directly, not check aligned
        SGTU    $2, R5, R7
@@ -57,25 +68,152 @@ tail:
        BNE     R7, clr_8
        SGTU    $17, R5, R7
        BNE     R7, clr_9through16
+
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
+       BNE     R7, lasx_tail
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
+       BNE     R7, lsx_tail
+
        SGTU    $33, R5, R7
        BNE     R7, clr_17through32
        SGTU    $65, R5, R7
        BNE     R7, clr_33through64
+       JMP     clr_large
 
-       // n > 64 bytes, check aligned
-       AND     $7, R4, R7
-       BEQ     R7, body
+lasx_tail:
+       // X0 = 0
+       XVXORV  X0, X0, X0
+
+       SGTU    $33, R5, R7
+       BNE     R7, lasx_clr_17through32
+       SGTU    $65, R5, R7
+       BNE     R7, lasx_clr_33through64
+       SGTU    $129, R5, R7
+       BNE     R7, lasx_clr_65through128
+       SGTU    $257, R5, R7
+       BNE     R7, lasx_clr_129through256
+       JMP     lasx_clr_large
+
+lsx_tail:
+       // V0 = 0
+       VXORV   V0, V0, V0
+
+       SGTU    $33, R5, R7
+       BNE     R7, lsx_clr_17through32
+       SGTU    $65, R5, R7
+       BNE     R7, lsx_clr_33through64
+       SGTU    $129, R5, R7
+       BNE     R7, lsx_clr_65through128
+       JMP     lsx_clr_large
+
+       // use 256-bit SIMD (LASX) instructions to implement memclr
+       // n > 256 bytes, check 32-byte alignment
+lasx_clr_large:
+       AND     $31, R4, R7
+       BEQ     R7, lasx_clr_256loop
+       XVMOVQ  X0, (R4)
+       SUBV    R7, R4
+       ADDV    R7, R5
+       SUBV    $32, R5 // newn = n - (32 - (ptr & 31))
+       ADDV    $32, R4 // newptr = ptr + (32 - (ptr & 31))
+       SGTU    $257, R5, R7
+       BNE     R7, lasx_clr_129through256
+lasx_clr_256loop:
+       SUBV    $256, R5
+       SGTU    $256, R5, R7
+       XVMOVQ  X0, 0(R4)
+       XVMOVQ  X0, 32(R4)
+       XVMOVQ  X0, 64(R4)
+       XVMOVQ  X0, 96(R4)
+       XVMOVQ  X0, 128(R4)
+       XVMOVQ  X0, 160(R4)
+       XVMOVQ  X0, 192(R4)
+       XVMOVQ  X0, 224(R4)
+       ADDV    $256, R4
+       BEQ     R7, lasx_clr_256loop
+
+       // remaining_length is 0
+       BEQ     R5, clr_0
+
+       // 128 < remaining_length < 256
+       SGTU    $129, R5, R7
+       BEQ     R7, lasx_clr_129through256
+
+       // 64 < remaining_length <= 128
+       SGTU    $65, R5, R7
+       BEQ     R7, lasx_clr_65through128
+
+       // 32 < remaining_length <= 64
+       SGTU    $33, R5, R7
+       BEQ     R7, lasx_clr_33through64
+
+       // 16 < remaining_length <= 32
+       SGTU    $17, R5, R7
+       BEQ     R7, lasx_clr_17through32
+
+       // 0 < remaining_length <= 16
+       JMP     tail
+
+       // use 128-bit SIMD (LSX) instructions to implement memclr
+       // n > 128 bytes, check 16-byte alignment
+lsx_clr_large:
+       AND     $15, R4, R7
+       BEQ     R7, lsx_clr_128loop
+       VMOVQ   V0, (R4)
+       SUBV    R7, R4
+       ADDV    R7, R5
+       SUBV    $16, R5 // newn = n - (16 - (ptr & 15))
+       ADDV    $16, R4 // newptr = ptr + (16 - (ptr & 15))
+       SGTU    $129, R5, R7
+       BNE     R7, lsx_clr_65through128
+lsx_clr_128loop:
+       SUBV    $128, R5
+       SGTU    $128, R5, R7
+       VMOVQ   V0, 0(R4)
+       VMOVQ   V0, 16(R4)
+       VMOVQ   V0, 32(R4)
+       VMOVQ   V0, 48(R4)
+       VMOVQ   V0, 64(R4)
+       VMOVQ   V0, 80(R4)
+       VMOVQ   V0, 96(R4)
+       VMOVQ   V0, 112(R4)
+       ADDV    $128, R4
+       BEQ     R7, lsx_clr_128loop
 
-head:
+       // remaining_length is 0
+       BEQ     R5, clr_0
+
+       // 64 < remaining_length <= 128
+       SGTU    $65, R5, R7
+       BEQ     R7, lsx_clr_65through128
+
+       // 32 < remaining_length <= 64
+       SGTU    $33, R5, R7
+       BEQ     R7, lsx_clr_33through64
+
+       // 16 < remaining_length <= 32
+       SGTU    $17, R5, R7
+       BEQ     R7, lsx_clr_17through32
+
+       // 0 < remaining_length <= 16
+       JMP     tail
+
+       // use general-purpose instructions to implement memclr
+       // n > 64 bytes, check 8-byte alignment
+clr_large:
+       AND     $7, R4, R7
+       BEQ     R7, clr_64loop
        MOVV    R0, (R4)
        SUBV    R7, R4
        ADDV    R7, R5
        ADDV    $8, R4  // newptr = ptr + (8 - (ptr & 7))
        SUBV    $8, R5  // newn = n - (8 - (ptr & 7))
-       SGTU    $65, R5, R7
-       BNE     R7, clr_33through64
-
-body:
+       MOVV    $64, R7
+       BLT     R5, R7, clr_33through64
+clr_64loop:
+       SUBV    $64, R5
+       SGTU    $64, R5, R7
        MOVV    R0, (R4)
        MOVV    R0, 8(R4)
        MOVV    R0, 16(R4)
@@ -84,11 +222,21 @@ body:
        MOVV    R0, 40(R4)
        MOVV    R0, 48(R4)
        MOVV    R0, 56(R4)
-       ADDV    $-64, R5
        ADDV    $64, R4
-       SGTU    $65, R5, R7
-       BEQ     R7, body
+       BEQ     R7, clr_64loop
+
+       // remaining_length is 0
        BEQ     R5, clr_0
+
+       // 32 < remaining_length < 64
+       SGTU    $33, R5, R7
+       BEQ     R7, clr_33through64
+
+       // 16 < remaining_length <= 32
+       SGTU    $17, R5, R7
+       BEQ     R7, clr_17through32
+
+       // 0 < remaining_length <= 16
        JMP     tail
 
 clr_0:
@@ -133,3 +281,49 @@ clr_33through64:
        MOVV    R0, -16(R6)
        MOVV    R0, -8(R6)
        RET
+
+lasx_clr_17through32:
+       VMOVQ   V0, 0(R4)
+       VMOVQ   V0, -16(R6)
+       RET
+lasx_clr_33through64:
+       XVMOVQ  X0, 0(R4)
+       XVMOVQ  X0, -32(R6)
+       RET
+lasx_clr_65through128:
+       XVMOVQ  X0, 0(R4)
+       XVMOVQ  X0, 32(R4)
+       XVMOVQ  X0, -64(R6)
+       XVMOVQ  X0, -32(R6)
+       RET
+lasx_clr_129through256:
+       XVMOVQ  X0, 0(R4)
+       XVMOVQ  X0, 32(R4)
+       XVMOVQ  X0, 64(R4)
+       XVMOVQ  X0, 96(R4)
+       XVMOVQ  X0, -128(R6)
+       XVMOVQ  X0, -96(R6)
+       XVMOVQ  X0, -64(R6)
+       XVMOVQ  X0, -32(R6)
+       RET
+
+lsx_clr_17through32:
+       VMOVQ   V0, 0(R4)
+       VMOVQ   V0, -16(R6)
+       RET
+lsx_clr_33through64:
+       VMOVQ   V0, 0(R4)
+       VMOVQ   V0, 16(R4)
+       VMOVQ   V0, -32(R6)
+       VMOVQ   V0, -16(R6)
+       RET
+lsx_clr_65through128:
+       VMOVQ   V0, 0(R4)
+       VMOVQ   V0, 16(R4)
+       VMOVQ   V0, 32(R4)
+       VMOVQ   V0, 48(R4)
+       VMOVQ   V0, -64(R6)
+       VMOVQ   V0, -48(R6)
+       VMOVQ   V0, -32(R6)
+       VMOVQ   V0, -16(R6)
+       RET
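
The tail cases above rely on overlapping stores: for 17 <= n <= 32, one
16-byte store at the start (0(R4)) and one ending exactly at ptr+count
(-16(R6), with R6 = R4 + count) cover the whole range with no loop and no
finer size dispatch. A minimal runnable sketch in Go, with illustrative names:

    package main

    import "fmt"

    // clr17through32 mirrors lsx_clr_17through32: two 16-byte stores,
    // the second overlapping the first whenever len(p) < 32.
    func clr17through32(p []byte) {
            var zero [16]byte
            copy(p[:16], zero[:])        // VMOVQ V0, 0(R4)
            copy(p[len(p)-16:], zero[:]) // VMOVQ V0, -16(R6)
    }

    func main() {
            buf := []byte("abcdefghijklmnopqrstuvwxyz") // 26 bytes, in [17, 32]
            clr17through32(buf)
            fmt.Println(buf) // prints all zeros
    }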