]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: memclr perf improvements on ppc64x
authorLynn Boger <laboger@linux.vnet.ibm.com>
Wed, 5 Oct 2016 20:12:05 +0000 (15:12 -0500)
committerMichael Munday <munday@ca.ibm.com>
Wed, 5 Oct 2016 22:18:14 +0000 (22:18 +0000)
This updates runtime/memclr_ppc64x.s to improve performance,
by unrolling loops for larger clears.

Fixes #17348

benchmark                    old MB/s     new MB/s     speedup
BenchmarkMemclr/5-80         199.71       406.63       2.04x
BenchmarkMemclr/16-80        693.66       1817.41      2.62x
BenchmarkMemclr/64-80        2309.35      5793.34      2.51x
BenchmarkMemclr/256-80       5428.18      14765.81     2.72x
BenchmarkMemclr/4096-80      8611.65      27191.94     3.16x
BenchmarkMemclr/65536-80     8736.69      28604.23     3.27x
BenchmarkMemclr/1M-80        9304.94      27600.09     2.97x
BenchmarkMemclr/4M-80        8705.66      27589.64     3.17x
BenchmarkMemclr/8M-80        8575.74      23631.04     2.76x
BenchmarkMemclr/16M-80       8443.10      19240.68     2.28x
BenchmarkMemclr/64M-80       8390.40      9493.04      1.13x
BenchmarkGoMemclr/5-80       263.05       630.37       2.40x
BenchmarkGoMemclr/16-80      904.33       1148.49      1.27x
BenchmarkGoMemclr/64-80      2830.20      8756.70      3.09x
BenchmarkGoMemclr/256-80     6064.59      20299.46     3.35x

Change-Id: Ic76c9183c8b4129ba3df512ca8b0fe6bd424e088
Reviewed-on: https://go-review.googlesource.com/30373
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: David Chase <drchase@google.com>
src/runtime/memclr_ppc64x.s

index 442faa25f267c92c9358ef6e04f785f0e06dd160..f7375dbee6717a02edc21310a185093eb900013d 100644 (file)
@@ -7,25 +7,56 @@
 #include "textflag.h"
 
 // void runtime·memclr(void*, uintptr)
-TEXT runtime·memclr(SB),NOSPLIT|NOFRAME,$0-16
-       MOVD    ptr+0(FP), R3
-       MOVD    n+8(FP), R4
-       SRADCC  $3, R4, R6      // R6 is the number of words to zero
-       BEQ     bytes
-
-       SUB     $8, R3
-       MOVD    R6, CTR
-       MOVDU   R0, 8(R3)
-       BC      25, 0, -1(PC)   // bdnz+ $-4
-       ADD     $8, R3
-
-bytes:
-       ANDCC   $7, R4, R7      // R7 is the number of bytes to zero
-       BEQ     done
-       SUB     $1, R3
-       MOVD    R7, CTR
-       MOVBU   R0, 1(R3)
-       BC      25, 0, -1(PC)   // bdnz+ $-4
-
-done:
+TEXT runtime·memclr(SB), NOSPLIT|NOFRAME, $0-16
+       MOVD ptr+0(FP), R3
+       MOVD n+8(FP), R4
+
+       // Determine if there are doublewords to clear
+check:
+       ANDCC $7, R4, R5  // R5: leftover bytes to clear
+       SRAD  $3, R4, R6  // R6: double words to clear
+       CMP   R6, $0, CR1 // CR1[EQ] set if no double words
+
+       BC     12, 6, nozerolarge // only single bytes
+       MOVD   R6, CTR            // R6 = number of double words
+       SRADCC $2, R6, R7         // 32 byte chunks?
+       BNE    zero32setup
+
+       // Clear double words
+
+zero8:
+       MOVD R0, 0(R3)    // double word
+       ADD  $8, R3
+       BC   16, 0, zero8 // dec ctr, br zero8 if ctr not 0
+       BR   nozerolarge  // handle remainder
+
+       // Prepare to clear 32 bytes at a time.
+
+zero32setup:
+       DCBTST (R3)    // prepare data cache
+       MOVD   R7, CTR // number of 32 byte chunks
+
+zero32:
+       MOVD    R0, 0(R3)       // clear 4 double words
+       MOVD    R0, 8(R3)
+       MOVD    R0, 16(R3)
+       MOVD    R0, 24(R3)
+       ADD     $32, R3
+       BC      16, 0, zero32   // dec ctr, br zero32 if ctr not 0
+       RLDCLCC $61, R4, $3, R6 // remaining doublewords
+       BEQ     nozerolarge
+       MOVD    R6, CTR         // set up the CTR for doublewords
+       BR      zero8
+
+nozerolarge:
+       CMP R5, $0   // any remaining bytes
+       BC  4, 1, LR // ble lr
+
+zerotail:
+       MOVD R5, CTR // set up to clear tail bytes
+
+zerotailloop:
+       MOVB R0, 0(R3)           // clear single bytes
+       ADD  $1, R3
+       BC   16, 0, zerotailloop // dec ctr, br zerotailloop if ctr not 0
        RET