Cypherpunks repositories - gostls13.git/commitdiff
runtime: optimize memclr on mips64x
author Julian Zhu <jz531210@gmail.com>
Tue, 17 Jun 2025 03:37:07 +0000 (11:37 +0800)
committer Gopher Robot <gobot@golang.org>
Thu, 24 Jul 2025 17:10:58 +0000 (10:10 -0700)
Memclr/5-4                          49.94n ±  5%         50.51n ±  1%        ~ (p=0.331 n=6)
Memclr/16-4                         22.71n ±  0%         21.01n ±  2%   -7.47% (p=0.002 n=6)
Memclr/64-4                         49.70n ±  1%         26.09n ±  1%  -47.51% (p=0.002 n=6)
Memclr/256-4                        84.23n ±  3%         44.32n ±  2%  -47.38% (p=0.002 n=6)
Memclr/4096-4                       805.6n ±  1%         220.9n ±  2%  -72.57% (p=0.002 n=6)
Memclr/65536-4                     12.734µ ±  1%         3.287µ ±  1%  -74.19% (p=0.002 n=6)
Memclr/1M-4                         209.1µ ±  0%         105.9µ ±  5%  -49.34% (p=0.002 n=6)
Memclr/4M-4                         838.9µ ±  6%         418.2µ ±  0%  -50.15% (p=0.002 n=6)
Memclr/8M-4                         1.708m ±  4%         1.108m ±  4%  -35.15% (p=0.002 n=6)
Memclr/16M-4                        3.458m ±  1%         2.840m ±  3%  -17.88% (p=0.002 n=6)
Memclr/64M-4                        14.05m ±  0%         11.40m ±  2%  -18.87% (p=0.002 n=6)
MemclrUnaligned/0_5-4               50.57n ±  2%         51.00n ±  0%        ~ (p=0.063 n=6)
MemclrUnaligned/0_16-4              48.82n ±  8%         22.39n ±  1%  -54.14% (p=0.002 n=6)
MemclrUnaligned/0_64-4              52.73n ±  3%         25.29n ±  0%  -52.05% (p=0.002 n=6)
MemclrUnaligned/0_256-4             88.41n ±  1%         50.04n ±  7%  -43.41% (p=0.002 n=6)
MemclrUnaligned/0_4096-4            802.2n ±  1%         220.4n ±  1%  -72.53% (p=0.002 n=6)
MemclrUnaligned/0_65536-4          12.729µ ±  0%         3.341µ ±  6%  -73.76% (p=0.002 n=6)
MemclrUnaligned/1_5-4               50.52n ±  0%         50.99n ±  6%   +0.93% (p=0.002 n=6)
MemclrUnaligned/1_16-4              71.23n ±  1%         71.78n ±  1%   +0.77% (p=0.041 n=6)
MemclrUnaligned/1_64-4              85.11n ±  0%         76.30n ±  1%  -10.36% (p=0.002 n=6)
MemclrUnaligned/1_256-4            133.50n ±  2%         91.91n ±  1%  -31.15% (p=0.002 n=6)
MemclrUnaligned/1_4096-4            849.7n ±  0%         291.3n ±  2%  -65.72% (p=0.002 n=6)
MemclrUnaligned/1_65536-4          12.776µ ±  1%         3.399µ ±  1%  -73.40% (p=0.002 n=6)
MemclrUnaligned/4_5-4               44.34n ±  0%         44.52n ±  7%   +0.41% (p=0.022 n=6)
MemclrUnaligned/4_16-4              70.68n ±  0%         71.24n ±  4%        ~ (p=0.132 n=6)
MemclrUnaligned/4_64-4              81.83n ±  4%         77.98n ±  2%   -4.71% (p=0.002 n=6)
MemclrUnaligned/4_256-4            121.15n ±  3%         87.58n ±  0%  -27.71% (p=0.002 n=6)
MemclrUnaligned/4_4096-4            837.0n ±  2%         278.8n ±  3%  -66.69% (p=0.002 n=6)
MemclrUnaligned/4_65536-4          12.793µ ±  6%         3.373µ ±  3%  -73.64% (p=0.002 n=6)
MemclrUnaligned/7_5-4               43.89n ±  2%         43.10n ±  0%   -1.80% (p=0.002 n=6)
MemclrUnaligned/7_16-4              73.59n ±  2%         72.95n ±  1%   -0.86% (p=0.006 n=6)
MemclrUnaligned/7_64-4              88.67n ±  0%         78.89n ±  1%  -11.03% (p=0.002 n=6)
MemclrUnaligned/7_256-4            123.90n ±  1%         85.41n ±  2%  -31.07% (p=0.002 n=6)
MemclrUnaligned/7_4096-4            842.8n ±  2%         268.0n ±  0%  -68.20% (p=0.002 n=6)
MemclrUnaligned/7_65536-4          12.877µ ± 11%         3.348µ ±  0%  -74.00% (p=0.002 n=6)
MemclrUnaligned/0_1M-4              208.4µ ±  5%         104.6µ ±  1%  -49.80% (p=0.002 n=6)
MemclrUnaligned/0_4M-4              836.1µ ±  7%         419.3µ ±  2%  -49.85% (p=0.002 n=6)
MemclrUnaligned/0_8M-4              1.701m ±  9%         1.136m ± 12%  -33.21% (p=0.002 n=6)
MemclrUnaligned/0_16M-4             3.467m ± 16%         2.832m ±  4%  -18.30% (p=0.002 n=6)
MemclrUnaligned/0_64M-4             14.05m ±  2%         11.33m ±  2%  -19.38% (p=0.002 n=6)
MemclrUnaligned/1_1M-4              208.8µ ±  4%         104.7µ ±  1%  -49.85% (p=0.002 n=6)
MemclrUnaligned/1_4M-4              838.0µ ±  0%         418.3µ ±  2%  -50.09% (p=0.002 n=6)
MemclrUnaligned/1_8M-4              1.692m ±  1%         1.108m ±  3%  -34.53% (p=0.002 n=6)
MemclrUnaligned/1_16M-4             3.463m ± 20%         2.833m ±  6%  -18.21% (p=0.002 n=6)
MemclrUnaligned/1_64M-4             14.05m ±  4%         11.35m ±  2%  -19.28% (p=0.002 n=6)
MemclrUnaligned/4_1M-4              209.2µ ±  1%         104.7µ ±  7%  -49.94% (p=0.002 n=6)
MemclrUnaligned/4_4M-4              836.2µ ±  6%         418.8µ ± 15%  -49.91% (p=0.002 n=6)
MemclrUnaligned/4_8M-4              1.702m ±  0%         1.123m ±  4%  -34.01% (p=0.002 n=6)
MemclrUnaligned/4_16M-4             3.476m ±  8%         2.804m ±  2%  -19.34% (p=0.002 n=6)
MemclrUnaligned/4_64M-4             14.13m ± 25%         11.40m ±  0%  -19.33% (p=0.002 n=6)
MemclrUnaligned/7_1M-4              208.9µ ±  8%         104.9µ ±  6%  -49.81% (p=0.002 n=6)
MemclrUnaligned/7_4M-4              845.6µ ± 12%         418.2µ ±  7%  -50.54% (p=0.002 n=6)
MemclrUnaligned/7_8M-4              1.706m ± 10%         1.101m ±  3%  -35.48% (p=0.002 n=6)
MemclrUnaligned/7_16M-4             3.466m ±  3%         2.812m ±  2%  -18.86% (p=0.002 n=6)
MemclrUnaligned/7_64M-4             14.08m ±  5%         11.35m ± 18%  -19.37% (p=0.002 n=6)
GoMemclr/5-4                        49.79n ±  2%         50.34n ±  0%        ~ (p=0.394 n=6)
GoMemclr/16-4                       21.64n ±  0%         22.04n ±  7%   +1.85% (p=0.002 n=6)
GoMemclr/64-4                       47.93n ±  4%         23.77n ±  4%  -50.41% (p=0.002 n=6)
GoMemclr/256-4                      82.77n ±  2%         43.90n ±  0%  -46.96% (p=0.002 n=6)

Change-Id: I272967d001809ac4948e4118df6cdd0e0661ab96
Reviewed-on: https://go-review.googlesource.com/c/go/+/682195
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>

src/runtime/memclr_mips64x.s

index cf3a9c4ab4fb3696860c153e3eec3b14509c7fe4..3df3728146a1077fba6eb1ddaa25f8c74d945f7e 100644 (file)
@@ -71,29 +71,93 @@ msa_large_loop:
 no_msa:
        // if less than 8 bytes, do one byte at a time
        SGTU    $8, R2, R3
-       BNE     R3, out
+       BNE     R3, check4
 
-       // do one byte at a time until 8-aligned
+       // Check alignment
        AND     $7, R1, R3
-       BEQ     R3, words
+       BEQ     R3, aligned
+
+       // Zero one byte at a time until we reach 8 byte alignment.
+       MOVV    $8, R5
+       SUBV    R3, R5, R3
+       SUBV    R3, R2, R2
+align:
+       SUBV    $1, R3
        MOVB    R0, (R1)
        ADDV    $1, R1
-       JMP     -4(PC)
+       BNE     R3, align
 
-words:
-       // do 8 bytes at a time if there is room
-       ADDV    $-7, R4, R2
+aligned:
+       SGTU    $8, R2, R3
+       BNE     R3, check4
+       SGTU    $16, R2, R3
+       BNE     R3, zero8
+       SGTU    $32, R2, R3
+       BNE     R3, zero16
+       SGTU    $64, R2, R3
+       BNE     R3, zero32
+loop64:
+       MOVV    R0, (R1)
+       MOVV    R0, 8(R1)
+       MOVV    R0, 16(R1)
+       MOVV    R0, 24(R1)
+       MOVV    R0, 32(R1)
+       MOVV    R0, 40(R1)
+       MOVV    R0, 48(R1)
+       MOVV    R0, 56(R1)
+       ADDV    $64, R1
+       SUBV    $64, R2
+       SGTU    $64, R2, R3
+       BEQ     R0, R3, loop64
+       BEQ     R2, done
+
+check32:
+       SGTU    $32, R2, R3
+       BNE     R3, check16
+zero32:
+       MOVV    R0, (R1)
+       MOVV    R0, 8(R1)
+       MOVV    R0, 16(R1)
+       MOVV    R0, 24(R1)
+       ADDV    $32, R1
+       SUBV    $32, R2
+       BEQ     R2, done
+
+check16:
+       SGTU    $16, R2, R3
+       BNE     R3, check8
+zero16:
+       MOVV    R0, (R1)
+       MOVV    R0, 8(R1)
+       ADDV    $16, R1
+       SUBV    $16, R2
+       BEQ     R2, done
 
-       SGTU    R2, R1, R3
-       BEQ     R3, out
+check8:
+       SGTU    $8, R2, R3
+       BNE     R3, check4
+zero8:
        MOVV    R0, (R1)
        ADDV    $8, R1
-       JMP     -4(PC)
+       SUBV    $8, R2
+       BEQ     R2, done
 
-out:
+check4:
+       SGTU    $4, R2, R3
+       BNE     R3, loop1
+zero4:
+       MOVB    R0, (R1)
+       MOVB    R0, 1(R1)
+       MOVB    R0, 2(R1)
+       MOVB    R0, 3(R1)
+       ADDV    $4, R1
+       SUBV    $4, R2
+
+loop1:
        BEQ     R1, R4, done
        MOVB    R0, (R1)
        ADDV    $1, R1
-       JMP     -3(PC)
+       JMP     loop1
 done:
        RET
+