From b6cf1d94dcca975125873056408091fca0ee92fb Mon Sep 17 00:00:00 2001
From: Julian Zhu
Date: Tue, 17 Jun 2025 11:37:07 +0800
Subject: [PATCH] runtime: optimize memclr on mips64x
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Memclr/5-4 49.94n ± 5% 50.51n ± 1% ~ (p=0.331 n=6)
Memclr/16-4 22.71n ± 0% 21.01n ± 2% -7.47% (p=0.002 n=6)
Memclr/64-4 49.70n ± 1% 26.09n ± 1% -47.51% (p=0.002 n=6)
Memclr/256-4 84.23n ± 3% 44.32n ± 2% -47.38% (p=0.002 n=6)
Memclr/4096-4 805.6n ± 1% 220.9n ± 2% -72.57% (p=0.002 n=6)
Memclr/65536-4 12.734µ ± 1% 3.287µ ± 1% -74.19% (p=0.002 n=6)
Memclr/1M-4 209.1µ ± 0% 105.9µ ± 5% -49.34% (p=0.002 n=6)
Memclr/4M-4 838.9µ ± 6% 418.2µ ± 0% -50.15% (p=0.002 n=6)
Memclr/8M-4 1.708m ± 4% 1.108m ± 4% -35.15% (p=0.002 n=6)
Memclr/16M-4 3.458m ± 1% 2.840m ± 3% -17.88% (p=0.002 n=6)
Memclr/64M-4 14.05m ± 0% 11.40m ± 2% -18.87% (p=0.002 n=6)
MemclrUnaligned/0_5-4 50.57n ± 2% 51.00n ± 0% ~ (p=0.063 n=6)
MemclrUnaligned/0_16-4 48.82n ± 8% 22.39n ± 1% -54.14% (p=0.002 n=6)
MemclrUnaligned/0_64-4 52.73n ± 3% 25.29n ± 0% -52.05% (p=0.002 n=6)
MemclrUnaligned/0_256-4 88.41n ± 1% 50.04n ± 7% -43.41% (p=0.002 n=6)
MemclrUnaligned/0_4096-4 802.2n ± 1% 220.4n ± 1% -72.53% (p=0.002 n=6)
MemclrUnaligned/0_65536-4 12.729µ ± 0% 3.341µ ± 6% -73.76% (p=0.002 n=6)
MemclrUnaligned/1_5-4 50.52n ± 0% 50.99n ± 6% +0.93% (p=0.002 n=6)
MemclrUnaligned/1_16-4 71.23n ± 1% 71.78n ± 1% +0.77% (p=0.041 n=6)
MemclrUnaligned/1_64-4 85.11n ± 0% 76.30n ± 1% -10.36% (p=0.002 n=6)
MemclrUnaligned/1_256-4 133.50n ± 2% 91.91n ± 1% -31.15% (p=0.002 n=6)
MemclrUnaligned/1_4096-4 849.7n ± 0% 291.3n ± 2% -65.72% (p=0.002 n=6)
MemclrUnaligned/1_65536-4 12.776µ ± 1% 3.399µ ± 1% -73.40% (p=0.002 n=6)
MemclrUnaligned/4_5-4 44.34n ± 0% 44.52n ± 7% +0.41% (p=0.022 n=6)
MemclrUnaligned/4_16-4 70.68n ± 0% 71.24n ± 4% ~ (p=0.132 n=6)
MemclrUnaligned/4_64-4 81.83n ± 4% 77.98n ± 2% -4.71% (p=0.002 n=6)
MemclrUnaligned/4_256-4 121.15n ± 3% 87.58n ± 0% -27.71% (p=0.002 n=6)
MemclrUnaligned/4_4096-4 837.0n ± 2% 278.8n ± 3% -66.69% (p=0.002 n=6)
MemclrUnaligned/4_65536-4 12.793µ ± 6% 3.373µ ± 3% -73.64% (p=0.002 n=6)
MemclrUnaligned/7_5-4 43.89n ± 2% 43.10n ± 0% -1.80% (p=0.002 n=6)
MemclrUnaligned/7_16-4 73.59n ± 2% 72.95n ± 1% -0.86% (p=0.006 n=6)
MemclrUnaligned/7_64-4 88.67n ± 0% 78.89n ± 1% -11.03% (p=0.002 n=6)
MemclrUnaligned/7_256-4 123.90n ± 1% 85.41n ± 2% -31.07% (p=0.002 n=6)
MemclrUnaligned/7_4096-4 842.8n ± 2% 268.0n ± 0% -68.20% (p=0.002 n=6)
MemclrUnaligned/7_65536-4 12.877µ ± 11% 3.348µ ± 0% -74.00% (p=0.002 n=6)
MemclrUnaligned/0_1M-4 208.4µ ± 5% 104.6µ ± 1% -49.80% (p=0.002 n=6)
MemclrUnaligned/0_4M-4 836.1µ ± 7% 419.3µ ± 2% -49.85% (p=0.002 n=6)
MemclrUnaligned/0_8M-4 1.701m ± 9% 1.136m ± 12% -33.21% (p=0.002 n=6)
MemclrUnaligned/0_16M-4 3.467m ± 16% 2.832m ± 4% -18.30% (p=0.002 n=6)
MemclrUnaligned/0_64M-4 14.05m ± 2% 11.33m ± 2% -19.38% (p=0.002 n=6)
MemclrUnaligned/1_1M-4 208.8µ ± 4% 104.7µ ± 1% -49.85% (p=0.002 n=6)
MemclrUnaligned/1_4M-4 838.0µ ± 0% 418.3µ ± 2% -50.09% (p=0.002 n=6)
MemclrUnaligned/1_8M-4 1.692m ± 1% 1.108m ± 3% -34.53% (p=0.002 n=6)
MemclrUnaligned/1_16M-4 3.463m ± 20% 2.833m ± 6% -18.21% (p=0.002 n=6)
MemclrUnaligned/1_64M-4 14.05m ± 4% 11.35m ± 2% -19.28% (p=0.002 n=6)
MemclrUnaligned/4_1M-4 209.2µ ± 1% 104.7µ ± 7% -49.94% (p=0.002 n=6)
MemclrUnaligned/4_4M-4 836.2µ ± 6% 418.8µ ± 15% -49.91% (p=0.002 n=6)
MemclrUnaligned/4_8M-4 1.702m ± 0% 1.123m ± 4% -34.01% (p=0.002 n=6)
MemclrUnaligned/4_16M-4 3.476m ± 8% 2.804m ± 2% -19.34% (p=0.002 n=6)
MemclrUnaligned/4_64M-4 14.13m ± 25% 11.40m ± 0% -19.33% (p=0.002 n=6)
MemclrUnaligned/7_1M-4 208.9µ ± 8% 104.9µ ± 6% -49.81% (p=0.002 n=6)
MemclrUnaligned/7_4M-4 845.6µ ± 12% 418.2µ ± 7% -50.54% (p=0.002 n=6)
MemclrUnaligned/7_8M-4 1.706m ± 10% 1.101m ± 3% -35.48% (p=0.002 n=6)
MemclrUnaligned/7_16M-4 3.466m ± 3% 2.812m ± 2% -18.86% (p=0.002 n=6)
MemclrUnaligned/7_64M-4 14.08m ± 5% 11.35m ± 18% -19.37% (p=0.002 n=6)
GoMemclr/5-4 49.79n ± 2% 
50.34n ± 0% ~ (p=0.394 n=6) GoMemclr/16-4 21.64n ± 0% 22.04n ± 7% +1.85% (p=0.002 n=6) GoMemclr/64-4 47.93n ± 4% 23.77n ± 4% -50.41% (p=0.002 n=6) GoMemclr/256-4 82.77n ± 2% 43.90n ± 0% -46.96% (p=0.002 n=6) Change-Id: I272967d001809ac4948e4118df6cdd0e0661ab96 Reviewed-on: https://go-review.googlesource.com/c/go/+/682195 Reviewed-by: Keith Randall LUCI-TryBot-Result: Go LUCI Reviewed-by: Keith Randall Reviewed-by: Michael Knyszek Auto-Submit: Michael Knyszek --- src/runtime/memclr_mips64x.s | 88 +++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 12 deletions(-) diff --git a/src/runtime/memclr_mips64x.s b/src/runtime/memclr_mips64x.s index cf3a9c4ab4..3df3728146 100644 --- a/src/runtime/memclr_mips64x.s +++ b/src/runtime/memclr_mips64x.s @@ -71,29 +71,93 @@ msa_large_loop: no_msa: // if less than 8 bytes, do one byte at a time SGTU $8, R2, R3 - BNE R3, out + BNE R3, check4 - // do one byte at a time until 8-aligned + // Check alignment AND $7, R1, R3 - BEQ R3, words + BEQ R3, aligned + + // Zero one byte at a time until we reach 8 byte alignment. 
+ MOVV $8, R5 + SUBV R3, R5, R3 + SUBV R3, R2, R2 +align: + SUBV $1, R3 MOVB R0, (R1) ADDV $1, R1 - JMP -4(PC) + BNE R3, align -words: - // do 8 bytes at a time if there is room - ADDV $-7, R4, R2 +aligned: + SGTU $8, R2, R3 + BNE R3, check4 + SGTU $16, R2, R3 + BNE R3, zero8 + SGTU $32, R2, R3 + BNE R3, zero16 + SGTU $64, R2, R3 + BNE R3, zero32 +loop64: + MOVV R0, (R1) + MOVV R0, 8(R1) + MOVV R0, 16(R1) + MOVV R0, 24(R1) + MOVV R0, 32(R1) + MOVV R0, 40(R1) + MOVV R0, 48(R1) + MOVV R0, 56(R1) + ADDV $64, R1 + SUBV $64, R2 + SGTU $64, R2, R3 + BEQ R0, R3, loop64 + BEQ R2, done + +check32: + SGTU $32, R2, R3 + BNE R3, check16 +zero32: + MOVV R0, (R1) + MOVV R0, 8(R1) + MOVV R0, 16(R1) + MOVV R0, 24(R1) + ADDV $32, R1 + SUBV $32, R2 + BEQ R2, done + +check16: + SGTU $16, R2, R3 + BNE R3, check8 +zero16: + MOVV R0, (R1) + MOVV R0, 8(R1) + ADDV $16, R1 + SUBV $16, R2 + BEQ R2, done - SGTU R2, R1, R3 - BEQ R3, out +check8: + SGTU $8, R2, R3 + BNE R3, check4 +zero8: MOVV R0, (R1) ADDV $8, R1 - JMP -4(PC) + SUBV $8, R2 + BEQ R2, done -out: +check4: + SGTU $4, R2, R3 + BNE R3, loop1 +zero4: + MOVB R0, (R1) + MOVB R0, 1(R1) + MOVB R0, 2(R1) + MOVB R0, 3(R1) + ADDV $4, R1 + SUBV $4, R2 + +loop1: BEQ R1, R4, done MOVB R0, (R1) ADDV $1, R1 - JMP -3(PC) + JMP loop1 done: RET + -- 2.51.0