runtime: Improvement in perf of s390x memclr

author kmvijay <kiran.m.vijay@ibm.com>

Thu, 3 Apr 2025 05:58:30 +0000 (05:58 +0000)

committer Gopher Robot <gobot@golang.org>

Wed, 14 May 2025 21:33:09 +0000 (14:33 -0700)
author kmvijay <kiran.m.vijay@ibm.com>
Thu, 3 Apr 2025 05:58:30 +0000 (05:58 +0000)
committer Gopher Robot <gobot@golang.org>
Wed, 14 May 2025 21:33:09 +0000 (14:33 -0700)
diff --git a/src/runtime/memclr_s390x.s b/src/runtime/memclr_s390x.s

index fa657ef66e6b9545417223164a4d163329fc6e1a..656e96998c93e32ac5937d0d41777880a303baeb 100644 (file)
--- a/src/runtime/memclr_s390x.s
+++ b/src/runtime/memclr_s390x.s
@@ -11,13 +11,13 @@ TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT|NOFRAME,$0-16
         MOVD    ptr+0(FP), R4
         MOVD    n+8(FP), R5
  
+       CMPBGE  R5, $32, clearge32
+
  start:
         CMPBLE  R5, $3, clear0to3
         CMPBLE  R5, $7, clear4to7
         CMPBLE  R5, $11, clear8to11
         CMPBLE  R5, $15, clear12to15
-       CMP     R5, $32
-       BGE     clearmt32
         MOVD    $0, 0(R4)
         MOVD    $0, 8(R4)
         ADD     $16, R4
@@ -102,23 +102,130 @@ clear15:
         MOVB    $0, 14(R4)
         RET
  
-clearmt32:
+clearge32:
+       CMP     R5, $4096
+       BLT     clear256Bto4KB
+
+// For sizes >= 4KB, the XC loop is unrolled 16 times (16 * 256B = 4KB per iteration)
+clearge4KB:
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       XC      $256, 0(R4), 0(R4)
+       ADD     $256, R4
+       ADD     $-256, R5
+       CMP     R5, $4096
+       BGE     clearge4KB
+
+clear256Bto4KB:
         CMP     R5, $256
-       BLT     clearlt256
+       BLT     clear32to255
         XC      $256, 0(R4), 0(R4)
         ADD     $256, R4
         ADD     $-256, R5
-       BR      clearmt32
-clearlt256:
+       BR      clear256Bto4KB
+
+clear32to255:
         CMPBEQ  R5, $0, done
-       ADD     $-1, R5
-       EXRL    $memclr_exrl_xc<>(SB), R5
-done:
+       CMPBLT  R5, $32, start
+       CMPBEQ  R5, $32, clear32
+       CMPBLE  R5, $64, clear33to64
+       CMP     R5, $128
+       BLE     clear65to128
+       CMP     R5, $255
+       BLE     clear129to255
+
+clear32:
+       VZERO   V1
+       VST     V1, 0(R4)
+       VST     V1, 16(R4)
         RET
  
-// DO NOT CALL - target for exrl (execute relative long) instruction.
-TEXT memclr_exrl_xc<>(SB),NOSPLIT|NOFRAME,$0-0
-       XC      $1, 0(R4), 0(R4)
-       MOVD    $0, 0(R0)
+clear33to64:
+       VZERO   V1
+       VST     V1, 0(R4)
+       VST     V1, 16(R4)
+       ADD     $-32, R5
+       VST     V1, 0(R4)(R5)
+       VST     V1, 16(R4)(R5)
+       RET
+
+clear65to128:
+       VZERO   V1
+       VST     V1, 0(R4)
+       VST     V1, 16(R4)
+       VST     V1, 32(R4)
+       VST     V1, 48(R4)
+       ADD     $-64, R5
+       VST     V1, 0(R4)(R5)
+       VST     V1, 16(R4)(R5)
+       VST     V1, 32(R4)(R5)
+       VST     V1, 48(R4)(R5)
+       RET
+
+clear129to255:
+       VZERO   V1
+       VST     V1, 0(R4)
+       VST     V1, 16(R4)
+       VST     V1, 32(R4)
+       VST     V1, 48(R4)
+       VST     V1, 64(R4)
+       VST     V1, 80(R4)
+       VST     V1, 96(R4)
+       VST     V1, 112(R4)
+       ADD     $-128, R5
+       VST     V1, 0(R4)(R5)
+       VST     V1, 16(R4)(R5)
+       VST     V1, 32(R4)(R5)
+       VST     V1, 48(R4)(R5)
+       VST     V1, 64(R4)(R5)
+       VST     V1, 80(R4)(R5)
+       VST     V1, 96(R4)(R5)
+       VST     V1, 112(R4)(R5)
+       RET
+
+done:
         RET
author	kmvijay <kiran.m.vijay@ibm.com>
	Thu, 3 Apr 2025 05:58:30 +0000 (05:58 +0000)
committer	Gopher Robot <gobot@golang.org>
	Wed, 14 May 2025 21:33:09 +0000 (14:33 -0700)