]> Cypherpunks repositories - gostls13.git/commit
runtime: Improvement in perf of s390x memclr
authorkmvijay <kiran.m.vijay@ibm.com>
Thu, 3 Apr 2025 05:58:30 +0000 (05:58 +0000)
committerGopher Robot <gobot@golang.org>
Wed, 14 May 2025 21:33:09 +0000 (14:33 -0700)
commita177448765744010aabb1d9c0fc0de0435d60dac
treee34440fd2914aa104ccbaff28508eacc70eff9d6
parentac341b8e6bc1eb99ddd62c3dea293e41bc582c10
runtime: Improvement in perf of s390x memclr

Memclr routine of s390x architecture is now implemented with vector operations.
And loop unrolling is used for larger sizes.

goos: linux
goarch: s390x
pkg: runtime
                        |    old.txt    |            new_final.txt             |
                        |    sec/op     |    sec/op     vs base                |
Memclr/5                   2.485n ±  5%   2.421n ±  0%   -2.54% (p=0.000 n=10)
Memclr/16                  3.037n ±  2%   2.969n ±  0%   -2.26% (p=0.001 n=10)
Memclr/64                  9.623n ±  0%   4.455n ±  1%  -53.70% (p=0.000 n=10)
Memclr/256                 3.347n ±  3%   3.312n ±  4%        ~ (p=0.670 n=10)
Memclr/4096                15.53n ±  0%   15.54n ±  0%   +0.06% (p=0.000 n=10)
Memclr/65536               329.8n ±  2%   228.4n ±  0%  -30.74% (p=0.000 n=10)
Memclr/1M                  13.09µ ±  0%   12.78µ ±  0%   -2.34% (p=0.000 n=10)
Memclr/4M                  52.33µ ±  0%   51.16µ ±  0%   -2.24% (p=0.000 n=10)
Memclr/8M                  104.6µ ±  0%   102.3µ ±  0%   -2.20% (p=0.000 n=10)
Memclr/16M                 209.4µ ±  0%   204.9µ ±  0%   -2.17% (p=0.000 n=10)
Memclr/64M                 977.8µ ±  0%   967.8µ ±  0%   -1.02% (p=0.000 n=10)
MemclrUnaligned/0_5        3.398n ±  0%   3.657n ±  0%   +7.62% (p=0.000 n=10)
MemclrUnaligned/0_16       3.957n ±  0%   3.958n ±  0%        ~ (p=0.325 n=10)
MemclrUnaligned/0_64      11.550n ±  0%   5.139n ±  0%  -55.51% (p=0.000 n=10)
MemclrUnaligned/0_256      4.288n ±  0%   4.025n ±  4%   -6.14% (p=0.000 n=10)
MemclrUnaligned/0_4096     15.53n ±  0%   15.53n ±  0%        ~ (p=1.000 n=10)
MemclrUnaligned/0_65536    318.3n ±  1%   233.9n ±  0%  -26.52% (p=0.000 n=10)
MemclrUnaligned/1_5        3.398n ±  0%   3.657n ±  0%   +7.62% (p=0.000 n=10)
MemclrUnaligned/1_16       3.965n ±  0%   3.969n ±  0%   +0.10% (p=0.000 n=10)
MemclrUnaligned/1_64      11.550n ±  0%   5.109n ±  0%  -55.76% (p=0.000 n=10)
MemclrUnaligned/1_256      4.385n ±  0%   4.174n ±  1%   -4.80% (p=0.000 n=10)
MemclrUnaligned/1_4096     26.23n ±  0%   26.24n ±  0%   +0.04% (p=0.005 n=10)
MemclrUnaligned/1_65536    570.5n ±  0%   401.3n ±  0%  -29.66% (p=0.000 n=10)
MemclrUnaligned/4_5        3.398n ±  0%   3.657n ±  0%   +7.62% (p=0.000 n=10)
MemclrUnaligned/4_16       3.965n ±  0%   3.973n ±  1%   +0.19% (p=0.000 n=10)
MemclrUnaligned/4_64      11.550n ±  0%   5.131n ±  0%  -55.58% (p=0.000 n=10)
MemclrUnaligned/4_256      4.419n ±  0%   4.187n ±  1%   -5.25% (p=0.000 n=10)
MemclrUnaligned/4_4096     26.23n ±  0%   26.24n ±  0%   +0.04% (p=0.011 n=10)
MemclrUnaligned/4_65536    570.5n ±  0%   401.2n ±  0%  -29.67% (p=0.000 n=10)
MemclrUnaligned/7_5        3.397n ±  0%   3.657n ±  0%   +7.65% (p=0.000 n=10)
MemclrUnaligned/7_16       3.965n ±  0%   3.969n ±  0%   +0.10% (p=0.000 n=10)
MemclrUnaligned/7_64      11.550n ±  0%   5.120n ±  0%  -55.67% (p=0.000 n=10)
MemclrUnaligned/7_256      4.407n ±  0%   4.188n ±  2%   -4.99% (p=0.000 n=10)
MemclrUnaligned/7_4096     26.24n ±  0%   26.24n ±  0%        ~ (p=1.000 n=10)
MemclrUnaligned/7_65536    570.8n ±  0%   401.3n ±  0%  -29.69% (p=0.000 n=10)
MemclrUnaligned/0_1M       13.08µ ±  0%   12.81µ ±  0%   -2.06% (p=0.000 n=10)
MemclrUnaligned/0_4M       52.28µ ±  0%   51.13µ ±  0%   -2.21% (p=0.000 n=10)
MemclrUnaligned/0_8M       104.6µ ±  0%   102.3µ ±  0%   -2.18% (p=0.000 n=10)
MemclrUnaligned/0_16M      209.5µ ±  0%   204.8µ ±  0%   -2.24% (p=0.000 n=10)
MemclrUnaligned/0_64M      977.7µ ±  0%   969.1µ ±  0%   -0.88% (p=0.000 n=10)
MemclrUnaligned/1_1M       17.49µ ±  0%   16.04µ ±  0%   -8.32% (p=0.000 n=10)
MemclrUnaligned/1_4M       69.92µ ±  0%   64.13µ ±  0%   -8.28% (p=0.000 n=10)
MemclrUnaligned/1_8M       139.8µ ±  0%   128.2µ ±  0%   -8.32% (p=0.000 n=10)
MemclrUnaligned/1_16M      279.9µ ±  0%   256.1µ ±  0%   -8.50% (p=0.000 n=10)
MemclrUnaligned/1_64M      1.250m ±  0%   1.216m ±  0%   -2.73% (p=0.000 n=10)
MemclrUnaligned/4_1M       17.50µ ±  0%   16.04µ ±  0%   -8.33% (p=0.000 n=10)
MemclrUnaligned/4_4M       69.93µ ±  0%   64.12µ ±  0%   -8.30% (p=0.000 n=10)
MemclrUnaligned/4_8M       139.8µ ±  0%   128.2µ ±  0%   -8.32% (p=0.000 n=10)
MemclrUnaligned/4_16M      280.2µ ±  0%   256.2µ ±  0%   -8.55% (p=0.000 n=10)
MemclrUnaligned/4_64M      1.250m ±  0%   1.216m ±  0%   -2.73% (p=0.000 n=10)
MemclrUnaligned/7_1M       17.50µ ±  0%   16.04µ ±  0%   -8.35% (p=0.000 n=10)
MemclrUnaligned/7_4M       69.92µ ±  0%   64.13µ ±  0%   -8.28% (p=0.000 n=10)
MemclrUnaligned/7_8M       139.8µ ±  0%   128.2µ ±  0%   -8.34% (p=0.000 n=10)
MemclrUnaligned/7_16M      279.6µ ±  0%   256.2µ ±  0%   -8.35% (p=0.000 n=10)
MemclrUnaligned/7_64M      1.250m ±  0%   1.216m ±  0%   -2.73% (p=0.000 n=10)
MemclrRange/1K_2K          1.053µ ±  0%   1.020µ ±  1%   -3.09% (p=0.000 n=10)
MemclrRange/2K_8K          1.552µ ±  0%   1.570µ ± 12%        ~ (p=0.137 n=10)
MemclrRange/4K_16K         1.283µ ±  0%   1.250µ ±  0%   -2.61% (p=0.000 n=10)
MemclrRange/160K_228K      20.62µ ±  0%   19.86µ ±  0%   -3.70% (p=0.000 n=10)
MemclrKnownSize1           1.732n ±  0%   1.732n ±  0%        ~ (p=1.000 n=10)
MemclrKnownSize2           1.925n ± 34%   1.967n ±  8%        ~ (p=0.080 n=10)
MemclrKnownSize4           1.808n ±  3%   1.732n ±  0%   -4.20% (p=0.000 n=10)
MemclrKnownSize8           2.002n ±  9%   1.773n ±  5%  -11.46% (p=0.000 n=10)
MemclrKnownSize16          2.880n ±  5%   2.461n ±  5%  -14.53% (p=0.000 n=10)
MemclrKnownSize32          8.082n ±  0%   2.838n ±  5%  -64.88% (p=0.000 n=10)
MemclrKnownSize64          8.083n ±  0%   4.960n ±  4%  -38.63% (p=0.000 n=10)
MemclrKnownSize112         8.082n ±  0%   5.533n ±  1%  -31.53% (p=0.000 n=10)
MemclrKnownSize128         8.082n ±  0%   5.534n ±  1%  -31.54% (p=0.000 n=10)
MemclrKnownSize192         8.082n ±  0%   6.833n ±  2%  -15.45% (p=0.000 n=10)
MemclrKnownSize248         8.082n ±  0%   7.165n ±  1%  -11.34% (p=0.000 n=10)
MemclrKnownSize256         2.995n ±  6%   3.226n ±  4%   +7.70% (p=0.006 n=10)
MemclrKnownSize512         3.356n ±  8%   3.595n ±  3%   +7.14% (p=0.007 n=10)
MemclrKnownSize1024        4.664n ±  0%   4.665n ±  0%        ~ (p=0.426 n=10)
MemclrKnownSize4096        15.80n ±  4%   15.15n ±  0%        ~ (p=0.449 n=10)
MemclrKnownSize512KiB      6.543µ ±  0%   6.380µ ±  0%   -2.48% (p=0.000 n=10)
geomean                    327.2n         286.6n        -12.42%

Change-Id: I0f8450743e2f7e736c5ff96a316a8b5d98b27222
Reviewed-on: https://go-review.googlesource.com/c/go/+/662475
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
src/runtime/memclr_s390x.s