]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: Improvement in perf of s390x memclr
authorkmvijay <kiran.m.vijay@ibm.com>
Thu, 3 Apr 2025 05:58:30 +0000 (05:58 +0000)
committerGopher Robot <gobot@golang.org>
Wed, 14 May 2025 21:33:09 +0000 (14:33 -0700)
Memclr routine of s390x architecture is now implemented with vector operations.
And loop unrolling is used for larger sizes.

goos: linux
goarch: s390x
pkg: runtime
                        |    old.txt    |            new_final.txt             |
                        |    sec/op     |    sec/op     vs base                |
Memclr/5                   2.485n ±  5%   2.421n ±  0%   -2.54% (p=0.000 n=10)
Memclr/16                  3.037n ±  2%   2.969n ±  0%   -2.26% (p=0.001 n=10)
Memclr/64                  9.623n ±  0%   4.455n ±  1%  -53.70% (p=0.000 n=10)
Memclr/256                 3.347n ±  3%   3.312n ±  4%        ~ (p=0.670 n=10)
Memclr/4096                15.53n ±  0%   15.54n ±  0%   +0.06% (p=0.000 n=10)
Memclr/65536               329.8n ±  2%   228.4n ±  0%  -30.74% (p=0.000 n=10)
Memclr/1M                  13.09µ ±  0%   12.78µ ±  0%   -2.34% (p=0.000 n=10)
Memclr/4M                  52.33µ ±  0%   51.16µ ±  0%   -2.24% (p=0.000 n=10)
Memclr/8M                  104.6µ ±  0%   102.3µ ±  0%   -2.20% (p=0.000 n=10)
Memclr/16M                 209.4µ ±  0%   204.9µ ±  0%   -2.17% (p=0.000 n=10)
Memclr/64M                 977.8µ ±  0%   967.8µ ±  0%   -1.02% (p=0.000 n=10)
MemclrUnaligned/0_5        3.398n ±  0%   3.657n ±  0%   +7.62% (p=0.000 n=10)
MemclrUnaligned/0_16       3.957n ±  0%   3.958n ±  0%        ~ (p=0.325 n=10)
MemclrUnaligned/0_64      11.550n ±  0%   5.139n ±  0%  -55.51% (p=0.000 n=10)
MemclrUnaligned/0_256      4.288n ±  0%   4.025n ±  4%   -6.14% (p=0.000 n=10)
MemclrUnaligned/0_4096     15.53n ±  0%   15.53n ±  0%        ~ (p=1.000 n=10)
MemclrUnaligned/0_65536    318.3n ±  1%   233.9n ±  0%  -26.52% (p=0.000 n=10)
MemclrUnaligned/1_5        3.398n ±  0%   3.657n ±  0%   +7.62% (p=0.000 n=10)
MemclrUnaligned/1_16       3.965n ±  0%   3.969n ±  0%   +0.10% (p=0.000 n=10)
MemclrUnaligned/1_64      11.550n ±  0%   5.109n ±  0%  -55.76% (p=0.000 n=10)
MemclrUnaligned/1_256      4.385n ±  0%   4.174n ±  1%   -4.80% (p=0.000 n=10)
MemclrUnaligned/1_4096     26.23n ±  0%   26.24n ±  0%   +0.04% (p=0.005 n=10)
MemclrUnaligned/1_65536    570.5n ±  0%   401.3n ±  0%  -29.66% (p=0.000 n=10)
MemclrUnaligned/4_5        3.398n ±  0%   3.657n ±  0%   +7.62% (p=0.000 n=10)
MemclrUnaligned/4_16       3.965n ±  0%   3.973n ±  1%   +0.19% (p=0.000 n=10)
MemclrUnaligned/4_64      11.550n ±  0%   5.131n ±  0%  -55.58% (p=0.000 n=10)
MemclrUnaligned/4_256      4.419n ±  0%   4.187n ±  1%   -5.25% (p=0.000 n=10)
MemclrUnaligned/4_4096     26.23n ±  0%   26.24n ±  0%   +0.04% (p=0.011 n=10)
MemclrUnaligned/4_65536    570.5n ±  0%   401.2n ±  0%  -29.67% (p=0.000 n=10)
MemclrUnaligned/7_5        3.397n ±  0%   3.657n ±  0%   +7.65% (p=0.000 n=10)
MemclrUnaligned/7_16       3.965n ±  0%   3.969n ±  0%   +0.10% (p=0.000 n=10)
MemclrUnaligned/7_64      11.550n ±  0%   5.120n ±  0%  -55.67% (p=0.000 n=10)
MemclrUnaligned/7_256      4.407n ±  0%   4.188n ±  2%   -4.99% (p=0.000 n=10)
MemclrUnaligned/7_4096     26.24n ±  0%   26.24n ±  0%        ~ (p=1.000 n=10)
MemclrUnaligned/7_65536    570.8n ±  0%   401.3n ±  0%  -29.69% (p=0.000 n=10)
MemclrUnaligned/0_1M       13.08µ ±  0%   12.81µ ±  0%   -2.06% (p=0.000 n=10)
MemclrUnaligned/0_4M       52.28µ ±  0%   51.13µ ±  0%   -2.21% (p=0.000 n=10)
MemclrUnaligned/0_8M       104.6µ ±  0%   102.3µ ±  0%   -2.18% (p=0.000 n=10)
MemclrUnaligned/0_16M      209.5µ ±  0%   204.8µ ±  0%   -2.24% (p=0.000 n=10)
MemclrUnaligned/0_64M      977.7µ ±  0%   969.1µ ±  0%   -0.88% (p=0.000 n=10)
MemclrUnaligned/1_1M       17.49µ ±  0%   16.04µ ±  0%   -8.32% (p=0.000 n=10)
MemclrUnaligned/1_4M       69.92µ ±  0%   64.13µ ±  0%   -8.28% (p=0.000 n=10)
MemclrUnaligned/1_8M       139.8µ ±  0%   128.2µ ±  0%   -8.32% (p=0.000 n=10)
MemclrUnaligned/1_16M      279.9µ ±  0%   256.1µ ±  0%   -8.50% (p=0.000 n=10)
MemclrUnaligned/1_64M      1.250m ±  0%   1.216m ±  0%   -2.73% (p=0.000 n=10)
MemclrUnaligned/4_1M       17.50µ ±  0%   16.04µ ±  0%   -8.33% (p=0.000 n=10)
MemclrUnaligned/4_4M       69.93µ ±  0%   64.12µ ±  0%   -8.30% (p=0.000 n=10)
MemclrUnaligned/4_8M       139.8µ ±  0%   128.2µ ±  0%   -8.32% (p=0.000 n=10)
MemclrUnaligned/4_16M      280.2µ ±  0%   256.2µ ±  0%   -8.55% (p=0.000 n=10)
MemclrUnaligned/4_64M      1.250m ±  0%   1.216m ±  0%   -2.73% (p=0.000 n=10)
MemclrUnaligned/7_1M       17.50µ ±  0%   16.04µ ±  0%   -8.35% (p=0.000 n=10)
MemclrUnaligned/7_4M       69.92µ ±  0%   64.13µ ±  0%   -8.28% (p=0.000 n=10)
MemclrUnaligned/7_8M       139.8µ ±  0%   128.2µ ±  0%   -8.34% (p=0.000 n=10)
MemclrUnaligned/7_16M      279.6µ ±  0%   256.2µ ±  0%   -8.35% (p=0.000 n=10)
MemclrUnaligned/7_64M      1.250m ±  0%   1.216m ±  0%   -2.73% (p=0.000 n=10)
MemclrRange/1K_2K          1.053µ ±  0%   1.020µ ±  1%   -3.09% (p=0.000 n=10)
MemclrRange/2K_8K          1.552µ ±  0%   1.570µ ± 12%        ~ (p=0.137 n=10)
MemclrRange/4K_16K         1.283µ ±  0%   1.250µ ±  0%   -2.61% (p=0.000 n=10)
MemclrRange/160K_228K      20.62µ ±  0%   19.86µ ±  0%   -3.70% (p=0.000 n=10)
MemclrKnownSize1           1.732n ±  0%   1.732n ±  0%        ~ (p=1.000 n=10)
MemclrKnownSize2           1.925n ± 34%   1.967n ±  8%        ~ (p=0.080 n=10)
MemclrKnownSize4           1.808n ±  3%   1.732n ±  0%   -4.20% (p=0.000 n=10)
MemclrKnownSize8           2.002n ±  9%   1.773n ±  5%  -11.46% (p=0.000 n=10)
MemclrKnownSize16          2.880n ±  5%   2.461n ±  5%  -14.53% (p=0.000 n=10)
MemclrKnownSize32          8.082n ±  0%   2.838n ±  5%  -64.88% (p=0.000 n=10)
MemclrKnownSize64          8.083n ±  0%   4.960n ±  4%  -38.63% (p=0.000 n=10)
MemclrKnownSize112         8.082n ±  0%   5.533n ±  1%  -31.53% (p=0.000 n=10)
MemclrKnownSize128         8.082n ±  0%   5.534n ±  1%  -31.54% (p=0.000 n=10)
MemclrKnownSize192         8.082n ±  0%   6.833n ±  2%  -15.45% (p=0.000 n=10)
MemclrKnownSize248         8.082n ±  0%   7.165n ±  1%  -11.34% (p=0.000 n=10)
MemclrKnownSize256         2.995n ±  6%   3.226n ±  4%   +7.70% (p=0.006 n=10)
MemclrKnownSize512         3.356n ±  8%   3.595n ±  3%   +7.14% (p=0.007 n=10)
MemclrKnownSize1024        4.664n ±  0%   4.665n ±  0%        ~ (p=0.426 n=10)
MemclrKnownSize4096        15.80n ±  4%   15.15n ±  0%        ~ (p=0.449 n=10)
MemclrKnownSize512KiB      6.543µ ±  0%   6.380µ ±  0%   -2.48% (p=0.000 n=10)
geomean                    327.2n         286.6n        -12.42%

Change-Id: I0f8450743e2f7e736c5ff96a316a8b5d98b27222
Reviewed-on: https://go-review.googlesource.com/c/go/+/662475
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/runtime/memclr_s390x.s

index fa657ef66e6b9545417223164a4d163329fc6e1a..656e96998c93e32ac5937d0d41777880a303baeb 100644 (file)
@@ -11,13 +11,13 @@ TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT|NOFRAME,$0-16
        MOVD    ptr+0(FP), R4
        MOVD    n+8(FP), R5
 
+       CMPBGE  R5, $32, clearge32
+
 start:
        CMPBLE  R5, $3, clear0to3
        CMPBLE  R5, $7, clear4to7
        CMPBLE  R5, $11, clear8to11
        CMPBLE  R5, $15, clear12to15
-       CMP     R5, $32
-       BGE     clearmt32
        MOVD    $0, 0(R4)
        MOVD    $0, 8(R4)
        ADD     $16, R4
@@ -102,23 +102,130 @@ clear15:
        MOVB    $0, 14(R4)
        RET
 
-clearmt32:
+clearge32:
+       CMP     R5, $4096
+       BLT     clear256Bto4KB
+
+// For size >= 4KB, XC is loop unrolled 16 times (4KB = 256B * 16)
+clearge4KB:
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       CMP     R5, $4096
+       BGE     clearge4KB
+
+clear256Bto4KB:
        CMP     R5, $256
-       BLT     clearlt256
+       BLT     clear32to255
        XC      $256, 0(R4), 0(R4)
        ADD     $256, R4
        ADD     $-256, R5
-       BR      clearmt32
-clearlt256:
+       BR      clear256Bto4KB
+
+clear32to255:
        CMPBEQ  R5, $0, done
-       ADD     $-1, R5
-       EXRL    $memclr_exrl_xc<>(SB), R5
-done:
+       CMPBLT  R5, $32, start
+       CMPBEQ  R5, $32, clear32
+       CMPBLE  R5, $64, clear33to64
+       CMP     R5, $128
+       BLE     clear65to128
+       CMP     R5, $255
+       BLE     clear129to255
+
+clear32:
+       VZERO   V1
+       VST     V1, 0(R4)
+       VST     V1, 16(R4)
        RET
 
-// DO NOT CALL - target for exrl (execute relative long) instruction.
-TEXT memclr_exrl_xc<>(SB),NOSPLIT|NOFRAME,$0-0
-       XC      $1, 0(R4), 0(R4)
-       MOVD    $0, 0(R0)
+clear33to64:
+       VZERO   V1
+       VST     V1, 0(R4)
+       VST     V1, 16(R4)
+       ADD     $-32, R5
+       VST     V1, 0(R4)(R5)
+       VST     V1, 16(R4)(R5)
+       RET
+
+clear65to128:
+       VZERO   V1
+       VST     V1, 0(R4)
+       VST     V1, 16(R4)
+       VST     V1, 32(R4)
+       VST     V1, 48(R4)
+       ADD     $-64, R5
+       VST     V1, 0(R4)(R5)
+       VST     V1, 16(R4)(R5)
+       VST     V1, 32(R4)(R5)
+       VST     V1, 48(R4)(R5)
+       RET
+
+clear129to255:
+       VZERO   V1
+       VST     V1, 0(R4)
+       VST     V1, 16(R4)
+       VST     V1, 32(R4)
+       VST     V1, 48(R4)
+       VST     V1, 64(R4)
+       VST     V1, 80(R4)
+       VST     V1, 96(R4)
+       VST     V1, 112(R4)
+       ADD     $-128, R5
+       VST     V1, 0(R4)(R5)
+       VST     V1, 16(R4)(R5)
+       VST     V1, 32(R4)(R5)
+       VST     V1, 48(R4)(R5)
+       VST     V1, 64(R4)(R5)
+       VST     V1, 80(R4)(R5)
+       VST     V1, 96(R4)(R5)
+       VST     V1, 112(R4)(R5)
+       RET
+
+done:
        RET