From: kmvijay Date: Thu, 3 Apr 2025 05:58:30 +0000 (+0000) Subject: runtime: Improvement in perf of s390x memclr X-Git-Tag: go1.25rc1~271 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=a177448765744010aabb1d9c0fc0de0435d60dac;p=gostls13.git runtime: Improvement in perf of s390x memclr Memclr routine of s390x architecture is now implemented with vector operations. And loop unrolling is used for larger sizes. goos: linux goarch: s390x pkg: runtime | old.txt | new_final.txt | | sec/op | sec/op vs base | Memclr/5 2.485n ± 5% 2.421n ± 0% -2.54% (p=0.000 n=10) Memclr/16 3.037n ± 2% 2.969n ± 0% -2.26% (p=0.001 n=10) Memclr/64 9.623n ± 0% 4.455n ± 1% -53.70% (p=0.000 n=10) Memclr/256 3.347n ± 3% 3.312n ± 4% ~ (p=0.670 n=10) Memclr/4096 15.53n ± 0% 15.54n ± 0% +0.06% (p=0.000 n=10) Memclr/65536 329.8n ± 2% 228.4n ± 0% -30.74% (p=0.000 n=10) Memclr/1M 13.09µ ± 0% 12.78µ ± 0% -2.34% (p=0.000 n=10) Memclr/4M 52.33µ ± 0% 51.16µ ± 0% -2.24% (p=0.000 n=10) Memclr/8M 104.6µ ± 0% 102.3µ ± 0% -2.20% (p=0.000 n=10) Memclr/16M 209.4µ ± 0% 204.9µ ± 0% -2.17% (p=0.000 n=10) Memclr/64M 977.8µ ± 0% 967.8µ ± 0% -1.02% (p=0.000 n=10) MemclrUnaligned/0_5 3.398n ± 0% 3.657n ± 0% +7.62% (p=0.000 n=10) MemclrUnaligned/0_16 3.957n ± 0% 3.958n ± 0% ~ (p=0.325 n=10) MemclrUnaligned/0_64 11.550n ± 0% 5.139n ± 0% -55.51% (p=0.000 n=10) MemclrUnaligned/0_256 4.288n ± 0% 4.025n ± 4% -6.14% (p=0.000 n=10) MemclrUnaligned/0_4096 15.53n ± 0% 15.53n ± 0% ~ (p=1.000 n=10) MemclrUnaligned/0_65536 318.3n ± 1% 233.9n ± 0% -26.52% (p=0.000 n=10) MemclrUnaligned/1_5 3.398n ± 0% 3.657n ± 0% +7.62% (p=0.000 n=10) MemclrUnaligned/1_16 3.965n ± 0% 3.969n ± 0% +0.10% (p=0.000 n=10) MemclrUnaligned/1_64 11.550n ± 0% 5.109n ± 0% -55.76% (p=0.000 n=10) MemclrUnaligned/1_256 4.385n ± 0% 4.174n ± 1% -4.80% (p=0.000 n=10) MemclrUnaligned/1_4096 26.23n ± 0% 26.24n ± 0% +0.04% (p=0.005 n=10) MemclrUnaligned/1_65536 570.5n ± 0% 401.3n ± 0% -29.66% (p=0.000 n=10) MemclrUnaligned/4_5 3.398n ± 0% 3.657n ± 0% +7.62% (p=0.000 n=10) MemclrUnaligned/4_16 3.965n ± 0% 3.973n ± 1% +0.19% (p=0.000 n=10) MemclrUnaligned/4_64 11.550n ± 0% 5.131n ± 0% -55.58% (p=0.000 n=10) MemclrUnaligned/4_256 4.419n ± 0% 4.187n ± 1% -5.25% (p=0.000 n=10) MemclrUnaligned/4_4096 26.23n ± 0% 26.24n ± 0% +0.04% (p=0.011 n=10) MemclrUnaligned/4_65536 570.5n ± 0% 401.2n ± 0% -29.67% (p=0.000 n=10) MemclrUnaligned/7_5 3.397n ± 0% 3.657n ± 0% +7.65% (p=0.000 n=10) MemclrUnaligned/7_16 3.965n ± 0% 3.969n ± 0% +0.10% (p=0.000 n=10) MemclrUnaligned/7_64 11.550n ± 0% 5.120n ± 0% -55.67% (p=0.000 n=10) MemclrUnaligned/7_256 4.407n ± 0% 4.188n ± 2% -4.99% (p=0.000 n=10) MemclrUnaligned/7_4096 26.24n ± 0% 26.24n ± 0% ~ (p=1.000 n=10) MemclrUnaligned/7_65536 570.8n ± 0% 401.3n ± 0% -29.69% (p=0.000 n=10) MemclrUnaligned/0_1M 13.08µ ± 0% 12.81µ ± 0% -2.06% (p=0.000 n=10) MemclrUnaligned/0_4M 52.28µ ± 0% 51.13µ ± 0% -2.21% (p=0.000 n=10) MemclrUnaligned/0_8M 104.6µ ± 0% 102.3µ ± 0% -2.18% (p=0.000 n=10) MemclrUnaligned/0_16M 209.5µ ± 0% 204.8µ ± 0% -2.24% (p=0.000 n=10) MemclrUnaligned/0_64M 977.7µ ± 0% 969.1µ ± 0% -0.88% (p=0.000 n=10) MemclrUnaligned/1_1M 17.49µ ± 0% 16.04µ ± 0% -8.32% (p=0.000 n=10) MemclrUnaligned/1_4M 69.92µ ± 0% 64.13µ ± 0% -8.28% (p=0.000 n=10) MemclrUnaligned/1_8M 139.8µ ± 0% 128.2µ ± 0% -8.32% (p=0.000 n=10) MemclrUnaligned/1_16M 279.9µ ± 0% 256.1µ ± 0% -8.50% (p=0.000 n=10) MemclrUnaligned/1_64M 1.250m ± 0% 1.216m ± 0% -2.73% (p=0.000 n=10) MemclrUnaligned/4_1M 17.50µ ± 0% 16.04µ ± 0% -8.33% (p=0.000 n=10) MemclrUnaligned/4_4M 69.93µ ± 0% 64.12µ ± 0% -8.30% (p=0.000 n=10) MemclrUnaligned/4_8M 139.8µ ± 0% 128.2µ ± 0% -8.32% (p=0.000 n=10) MemclrUnaligned/4_16M 280.2µ ± 0% 256.2µ ± 0% -8.55% (p=0.000 n=10) MemclrUnaligned/4_64M 1.250m ± 0% 1.216m ± 0% -2.73% (p=0.000 n=10) MemclrUnaligned/7_1M 17.50µ ± 0% 16.04µ ± 0% -8.35% (p=0.000 n=10) MemclrUnaligned/7_4M 69.92µ ± 0% 64.13µ ± 0% -8.28% (p=0.000 n=10) MemclrUnaligned/7_8M 139.8µ ± 0% 128.2µ ± 0% -8.34% (p=0.000 n=10) MemclrUnaligned/7_16M 279.6µ ± 0% 256.2µ ± 0% -8.35% (p=0.000 n=10) MemclrUnaligned/7_64M 1.250m ± 0% 1.216m ± 0% -2.73% (p=0.000 n=10) MemclrRange/1K_2K 1.053µ ± 0% 1.020µ ± 1% -3.09% (p=0.000 n=10) MemclrRange/2K_8K 1.552µ ± 0% 1.570µ ± 12% ~ (p=0.137 n=10) MemclrRange/4K_16K 1.283µ ± 0% 1.250µ ± 0% -2.61% (p=0.000 n=10) MemclrRange/160K_228K 20.62µ ± 0% 19.86µ ± 0% -3.70% (p=0.000 n=10) MemclrKnownSize1 1.732n ± 0% 1.732n ± 0% ~ (p=1.000 n=10) MemclrKnownSize2 1.925n ± 34% 1.967n ± 8% ~ (p=0.080 n=10) MemclrKnownSize4 1.808n ± 3% 1.732n ± 0% -4.20% (p=0.000 n=10) MemclrKnownSize8 2.002n ± 9% 1.773n ± 5% -11.46% (p=0.000 n=10) MemclrKnownSize16 2.880n ± 5% 2.461n ± 5% -14.53% (p=0.000 n=10) MemclrKnownSize32 8.082n ± 0% 2.838n ± 5% -64.88% (p=0.000 n=10) MemclrKnownSize64 8.083n ± 0% 4.960n ± 4% -38.63% (p=0.000 n=10) MemclrKnownSize112 8.082n ± 0% 5.533n ± 1% -31.53% (p=0.000 n=10) MemclrKnownSize128 8.082n ± 0% 5.534n ± 1% -31.54% (p=0.000 n=10) MemclrKnownSize192 8.082n ± 0% 6.833n ± 2% -15.45% (p=0.000 n=10) MemclrKnownSize248 8.082n ± 0% 7.165n ± 1% -11.34% (p=0.000 n=10) MemclrKnownSize256 2.995n ± 6% 3.226n ± 4% +7.70% (p=0.006 n=10) MemclrKnownSize512 3.356n ± 8% 3.595n ± 3% +7.14% (p=0.007 n=10) MemclrKnownSize1024 4.664n ± 0% 4.665n ± 0% ~ (p=0.426 n=10) MemclrKnownSize4096 15.80n ± 4% 15.15n ± 0% ~ (p=0.449 n=10) MemclrKnownSize512KiB 6.543µ ± 0% 6.380µ ± 0% -2.48% (p=0.000 n=10) geomean 327.2n 286.6n -12.42% Change-Id: I0f8450743e2f7e736c5ff96a316a8b5d98b27222 Reviewed-on: https://go-review.googlesource.com/c/go/+/662475 Reviewed-by: Keith Randall Reviewed-by: Cherry Mui Auto-Submit: Keith Randall LUCI-TryBot-Result: Go LUCI --- diff --git a/src/runtime/memclr_s390x.s b/src/runtime/memclr_s390x.s index fa657ef66e..656e96998c 100644 --- a/src/runtime/memclr_s390x.s +++ b/src/runtime/memclr_s390x.s @@ -11,13 +11,13 @@ TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT|NOFRAME,$0-16 MOVD ptr+0(FP), R4 MOVD n+8(FP), R5 + CMPBGE R5, $32, clearge32 + start: CMPBLE R5, $3, clear0to3 CMPBLE R5, $7, clear4to7 CMPBLE R5, $11, clear8to11 CMPBLE R5, $15, clear12to15 - CMP R5, $32 - BGE clearmt32 MOVD $0, 0(R4) MOVD $0, 8(R4) ADD $16, R4 @@ -102,23 +102,130 @@ clear15: MOVB $0, 14(R4) RET -clearmt32: +clearge32: + CMP R5, $4096 + BLT clear256Bto4KB + +// For size >= 4KB, XC is loop unrolled 16 times (4KB = 256B * 16) +clearge4KB: + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + XC $256, 0(R4), 0(R4) + ADD $256, R4 + ADD $-256, R5 + CMP R5, $4096 + BGE clearge4KB + +clear256Bto4KB: CMP R5, $256 - BLT clearlt256 + BLT clear32to255 XC $256, 0(R4), 0(R4) ADD $256, R4 ADD $-256, R5 - BR clearmt32 -clearlt256: + BR clear256Bto4KB + +clear32to255: CMPBEQ R5, $0, done - ADD $-1, R5 - EXRL $memclr_exrl_xc<>(SB), R5 -done: + CMPBLT R5, $32, start + CMPBEQ R5, $32, clear32 + CMPBLE R5, $64, clear33to64 + CMP R5, $128 + BLE clear65to128 + CMP R5, $255 + BLE clear129to255 + +clear32: + VZERO V1 + VST V1, 0(R4) + VST V1, 16(R4) RET -// DO NOT CALL - target for exrl (execute relative long) instruction. -TEXT memclr_exrl_xc<>(SB),NOSPLIT|NOFRAME,$0-0 - XC $1, 0(R4), 0(R4) - MOVD $0, 0(R0) +clear33to64: + VZERO V1 + VST V1, 0(R4) + VST V1, 16(R4) + ADD $-32, R5 + VST V1, 0(R4)(R5) + VST V1, 16(R4)(R5) + RET + +clear65to128: + VZERO V1 + VST V1, 0(R4) + VST V1, 16(R4) + VST V1, 32(R4) + VST V1, 48(R4) + ADD $-64, R5 + VST V1, 0(R4)(R5) + VST V1, 16(R4)(R5) + VST V1, 32(R4)(R5) + VST V1, 48(R4)(R5) + RET + +clear129to255: + VZERO V1 + VST V1, 0(R4) + VST V1, 16(R4) + VST V1, 32(R4) + VST V1, 48(R4) + VST V1, 64(R4) + VST V1, 80(R4) + VST V1, 96(R4) + VST V1, 112(R4) + ADD $-128, R5 + VST V1, 0(R4)(R5) + VST V1, 16(R4)(R5) + VST V1, 32(R4)(R5) + VST V1, 48(R4)(R5) + VST V1, 64(R4)(R5) + VST V1, 80(R4)(R5) + VST V1, 96(R4)(R5) + VST V1, 112(R4)(R5) + RET + +done: RET