Cypherpunks repositories - gostls13.git/commit
runtime: improvement in memclr for s390x
author kmvijay <kiran.m.vijay@ibm.com>
Wed, 18 Jun 2025 10:57:46 +0000 (10:57 +0000)
committer Gopher Robot <gobot@golang.org>
Thu, 24 Jul 2025 17:09:39 +0000 (10:09 -0700)
commit a8edd994792a6426d3f28ef4c85b6bb3f0d7ec05
tree 5ca2e3917be14a1f2c1039dd3e9a7305308929f3
parent bd04f65511791860276f2f3f982133f7be007448
runtime: improvement in memclr for s390x

The unrolled loop for sizes >= 4KB is further optimized.
Offsets are computed and included in the XC instruction directly.
This reduces code size and instruction count, and improves performance.

goos: linux
goarch: s390x
pkg: runtime
                        | Orig_Memclr_for_benchstat_2.log | MM_Memclr_for_benchstat_No_VSTL_3.log  |
                        |             sec/op              |    sec/op     vs base                  |
Memclr/5                                     1.925n ±  0%   1.925n ±  0%        ~ (p=0.211 n=10)
Memclr/16                                    2.604n ± 13%   2.633n ± 11%        ~ (p=0.912 n=10)
Memclr/64                                    3.598n ±  2%   3.520n ±  5%        ~ (p=0.190 n=10)
Memclr/256                                   3.571n ± 12%   3.538n ± 11%        ~ (p=0.739 n=10)
Memclr/4096                                  15.15n ±  0%   15.14n ±  0%        ~ (p=0.204 n=10)
Memclr/65536                                 226.3n ±  0%   224.9n ±  0%   -0.62% (p=0.000 n=10)
Memclr/1M                                    12.77µ ±  0%   12.60µ ±  0%   -1.35% (p=0.000 n=10)
Memclr/4M                                    51.07µ ±  0%   50.37µ ±  0%   -1.38% (p=0.000 n=10)
Memclr/8M                                    102.1µ ±  0%   100.7µ ±  0%   -1.36% (p=0.000 n=10)
Memclr/16M                                   204.4µ ±  0%   201.6µ ±  0%   -1.35% (p=0.000 n=10)
Memclr/64M                                   965.4µ ±  0%   935.3µ ±  0%   -3.12% (p=0.000 n=10)
MemclrUnaligned/0_5                          2.671n ±  6%   2.618n ±  0%        ~ (p=0.194 n=10)
MemclrUnaligned/0_16                         3.143n ±  6%   2.955n ±  8%        ~ (p=0.089 n=10)
MemclrUnaligned/0_64                         3.622n ±  3%   3.571n ±  2%        ~ (p=0.304 n=10)
MemclrUnaligned/0_256                        3.712n ±  8%   3.653n ±  5%        ~ (p=0.754 n=10)
MemclrUnaligned/0_4096                       15.14n ±  0%   15.14n ±  0%        ~ (p=1.000 n=10) ¹
MemclrUnaligned/0_65536                      231.9n ±  0%   225.2n ±  0%   -2.91% (p=0.000 n=10)
MemclrUnaligned/1_5                          2.620n ±  8%   2.620n ±  0%        ~ (p=0.866 n=10)
MemclrUnaligned/1_16                         3.103n ±  7%   2.933n ±  9%        ~ (p=0.052 n=10)
MemclrUnaligned/1_64                         3.576n ±  3%   3.568n ±  3%        ~ (p=0.748 n=10)
MemclrUnaligned/1_256                        3.744n ±  9%   3.709n ± 10%        ~ (p=0.853 n=10)
MemclrUnaligned/1_4096                       26.23n ±  0%   26.23n ±  0%        ~ (p=1.000 n=10) ¹
MemclrUnaligned/1_65536                      401.1n ±  0%   399.5n ±  0%   -0.40% (p=0.000 n=10)
MemclrUnaligned/4_5                          2.620n ±  6%   2.623n ±  0%        ~ (p=0.985 n=10)
MemclrUnaligned/4_16                         3.095n ±  7%   3.005n ±  9%        ~ (p=0.247 n=10)
MemclrUnaligned/4_64                         3.586n ±  1%   3.578n ±  3%        ~ (p=1.000 n=10)
MemclrUnaligned/4_256                        3.843n ±  5%   3.742n ± 10%        ~ (p=0.971 n=10)
MemclrUnaligned/4_4096                       26.23n ±  0%   26.23n ±  0%        ~ (p=1.000 n=10)
MemclrUnaligned/4_65536                      401.1n ±  0%   399.5n ±  0%   -0.41% (p=0.000 n=10)
MemclrUnaligned/7_5                          2.634n ±  6%   2.644n ±  4%        ~ (p=0.896 n=10)
MemclrUnaligned/7_16                         3.119n ±  7%   3.044n ±  9%        ~ (p=0.529 n=10)
MemclrUnaligned/7_64                         3.568n ±  1%   3.585n ±  3%        ~ (p=0.499 n=10)
MemclrUnaligned/7_256                        3.741n ±  9%   3.629n ±  6%        ~ (p=0.853 n=10)
MemclrUnaligned/7_4096                       26.23n ±  0%   26.23n ±  0%        ~ (p=1.000 n=10) ¹
MemclrUnaligned/7_65536                      401.1n ±  0%   399.4n ±  0%   -0.42% (p=0.000 n=10)
MemclrUnaligned/0_1M                         12.82µ ±  0%   12.60µ ±  0%   -1.70% (p=0.000 n=10)
MemclrUnaligned/0_4M                         51.28µ ±  0%   50.37µ ±  0%   -1.77% (p=0.000 n=10)
MemclrUnaligned/0_8M                         102.5µ ±  0%   100.8µ ±  0%   -1.75% (p=0.000 n=10)
MemclrUnaligned/0_16M                        205.1µ ±  0%   201.7µ ±  0%   -1.62% (p=0.000 n=10)
MemclrUnaligned/0_64M                        965.2µ ±  0%   934.7µ ±  0%   -3.16% (p=0.000 n=10)
MemclrUnaligned/1_1M                         16.02µ ±  0%   15.81µ ±  0%   -1.34% (p=0.000 n=10)
MemclrUnaligned/1_4M                         64.03µ ±  0%   63.20µ ±  0%   -1.29% (p=0.000 n=10)
MemclrUnaligned/1_8M                         128.0µ ±  0%   126.4µ ±  0%   -1.27% (p=0.000 n=10)
MemclrUnaligned/1_16M                        256.3µ ±  0%   253.2µ ±  0%   -1.21% (p=0.000 n=10)
MemclrUnaligned/1_64M                        1.210m ±  0%   1.187m ±  0%   -1.88% (p=0.000 n=10)
MemclrUnaligned/4_1M                         16.03µ ±  0%   15.81µ ±  0%   -1.37% (p=0.000 n=10)
MemclrUnaligned/4_4M                         64.04µ ±  0%   63.20µ ±  0%   -1.31% (p=0.000 n=10)
MemclrUnaligned/4_8M                         128.0µ ±  0%   126.4µ ±  0%   -1.27% (p=0.000 n=10)
MemclrUnaligned/4_16M                        256.1µ ±  0%   253.0µ ±  0%   -1.20% (p=0.000 n=10)
MemclrUnaligned/4_64M                        1.210m ±  0%   1.188m ±  0%   -1.81% (p=0.000 n=10)
MemclrUnaligned/7_1M                         16.02µ ±  0%   15.81µ ±  0%   -1.32% (p=0.000 n=10)
MemclrUnaligned/7_4M                         64.06µ ±  0%   63.21µ ±  0%   -1.34% (p=0.000 n=10)
MemclrUnaligned/7_8M                         128.1µ ±  0%   126.4µ ±  0%   -1.29% (p=0.000 n=10)
MemclrUnaligned/7_16M                        256.2µ ±  0%   253.2µ ±  0%   -1.18% (p=0.000 n=10)
MemclrUnaligned/7_64M                        1.210m ±  0%   1.188m ±  0%   -1.82% (p=0.000 n=10)
MemclrRange/1K_2K                            841.1n ±  1%   879.0n ±  3%   +4.51% (p=0.002 n=10)
MemclrRange/2K_8K                            1.435µ ±  2%   1.415µ ±  0%   -1.39% (p=0.000 n=10)
MemclrRange/4K_16K                           1.241µ ±  0%   1.209µ ±  0%   -2.58% (p=0.000 n=10)
MemclrRange/160K_228K                        19.83µ ±  0%   19.59µ ±  0%   -1.22% (p=0.000 n=10)
MemclrKnownSize1                             1.732n ±  0%   1.732n ±  0%        ~ (p=0.474 n=10)
MemclrKnownSize2                             1.925n ±  3%   1.925n ±  1%        ~ (p=0.929 n=10)
MemclrKnownSize4                             1.732n ±  0%   1.732n ±  0%        ~ (p=1.000 n=10) ¹
MemclrKnownSize8                             1.732n ±  0%   1.732n ±  0%        ~ (p=1.000 n=10)
MemclrKnownSize16                            2.413n ±  9%   2.681n ± 14%  +11.10% (p=0.004 n=10)
MemclrKnownSize32                            3.284n ±  4%   3.328n ±  2%        ~ (p=0.671 n=10)
MemclrKnownSize64                            4.893n ±  1%   4.882n ±  1%        ~ (p=0.591 n=10)
MemclrKnownSize112                           5.623n ±  2%   5.596n ±  2%   -0.48% (p=0.027 n=10)
MemclrKnownSize128                           5.612n ±  1%   5.599n ±  0%        ~ (p=0.066 n=10)
MemclrKnownSize192                           7.128n ±  1%   7.337n ±  2%   +2.93% (p=0.000 n=10)
MemclrKnownSize248                           6.740n ±  1%   6.829n ±  3%   +1.33% (p=0.005 n=10)
MemclrKnownSize256                           3.657n ±  8%   3.512n ± 14%        ~ (p=0.436 n=10)
MemclrKnownSize512                           3.624n ±  3%   3.982n ±  9%   +9.88% (p=0.017 n=10)
MemclrKnownSize1024                          4.662n ±  0%   4.680n ±  0%   +0.39% (p=0.000 n=10)
MemclrKnownSize4096                          15.14n ±  0%   15.15n ±  0%   +0.07% (p=0.000 n=10)
MemclrKnownSize512KiB                        6.388µ ±  0%   6.309µ ±  0%   -1.24% (p=0.000 n=10)
geomean                                      268.9n         266.9n         -0.75%
¹ all samples are equal

Change-Id: I2911866fb82777311ec4219600fb48c85f7bf862
Reviewed-on: https://go-review.googlesource.com/c/go/+/682595
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
src/runtime/memclr_s390x.s