]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: fix 32B backward copy on ppc64x
authorPaul E. Murphy <murp@ibm.com>
Tue, 8 Feb 2022 15:09:36 +0000 (09:09 -0600)
committerPaul Murphy <murp@ibm.com>
Thu, 3 Mar 2022 14:39:51 +0000 (14:39 +0000)
The test to enter the 32b copy loop always fails, and execution
falls back to a single 8B/iteration copy loop for copies of more
than 7 bytes. Likewise, the 32B loop has SRC/DST args mixed,
and fails to truncate DWORDS after completing.

Fix these, and unroll the 8B/iteration loop as it will only
execute 1-3 times if reached.

POWER10 benchmarks:

name                             old speed      new speed       delta
MemmoveOverlap/32                5.28GB/s ± 0%  10.37GB/s ± 0%   +96.22%
MemmoveOverlap/64                5.97GB/s ± 0%  18.15GB/s ± 0%  +203.95%
MemmoveOverlap/128               7.67GB/s ± 0%  24.35GB/s ± 0%  +217.41%
MemmoveOverlap/256               14.1GB/s ± 0%   25.0GB/s ± 0%   +77.48%
MemmoveOverlap/512               14.2GB/s ± 0%   30.9GB/s ± 0%  +118.19%
MemmoveOverlap/1024              12.3GB/s ± 0%   36.4GB/s ± 0%  +194.75%
MemmoveOverlap/2048              13.7GB/s ± 0%   48.8GB/s ± 0%  +255.24%
MemmoveOverlap/4096              14.1GB/s ± 0%   43.4GB/s ± 0%  +208.80%
MemmoveUnalignedDstOverlap/32    5.07GB/s ± 0%   3.78GB/s ± 0%   -25.33%
MemmoveUnalignedDstOverlap/64    6.00GB/s ± 0%   9.59GB/s ± 0%   +59.78%
MemmoveUnalignedDstOverlap/128   7.66GB/s ± 0%  13.51GB/s ± 0%   +76.42%
MemmoveUnalignedDstOverlap/256   13.4GB/s ± 0%   24.3GB/s ± 0%   +80.92%
MemmoveUnalignedDstOverlap/512   13.9GB/s ± 0%   30.3GB/s ± 0%  +118.29%
MemmoveUnalignedDstOverlap/1024  12.3GB/s ± 0%   37.3GB/s ± 0%  +203.07%
MemmoveUnalignedDstOverlap/2048  13.7GB/s ± 0%   45.9GB/s ± 0%  +235.39%
MemmoveUnalignedDstOverlap/4096  13.9GB/s ± 0%   41.2GB/s ± 0%  +196.34%
MemmoveUnalignedSrcOverlap/32    5.13GB/s ± 0%   5.18GB/s ± 0%    +0.98%
MemmoveUnalignedSrcOverlap/64    6.26GB/s ± 0%   9.53GB/s ± 0%   +52.29%
MemmoveUnalignedSrcOverlap/128   7.94GB/s ± 0%  18.40GB/s ± 0%  +131.76%
MemmoveUnalignedSrcOverlap/256   14.1GB/s ± 0%   25.5GB/s ± 0%   +81.40%
MemmoveUnalignedSrcOverlap/512   14.2GB/s ± 0%   30.9GB/s ± 0%  +116.76%
MemmoveUnalignedSrcOverlap/1024  12.4GB/s ± 0%   46.4GB/s ± 0%  +275.22%
MemmoveUnalignedSrcOverlap/2048  13.7GB/s ± 0%   48.7GB/s ± 0%  +255.16%
MemmoveUnalignedSrcOverlap/4096  14.0GB/s ± 0%   43.2GB/s ± 0%  +208.89%

Change-Id: I9fc6956ff454a2856d56077d1014388fb74c1f52
Reviewed-on: https://go-review.googlesource.com/c/go/+/384074
Trust: Paul Murphy <murp@ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>

src/runtime/memmove_ppc64x.s

index e69e71a4a1872427a870ae3eae0fd728c55a9ee2..2152fb4f69fedf2381a7abbef688119b792cf71e 100644 (file)
@@ -139,36 +139,38 @@ backwardtailloop:
        BC      16, 0, backwardtailloop // bndz
 
 nobackwardtail:
-       BC      4, 5, LR                // ble CR1 lr
+       BC      4, 5, LR                // blelr cr1, return if DWORDS == 0
+       SRDCC   $2,DWORDS,QWORDS        // Compute number of 32B blocks and compare to 0
+       BNE     backward32setup         // If QWORDS != 0, start the 32B copy loop.
 
-backwardlarge:
-       MOVD    DWORDS, CTR
-       SUB     TGT, SRC, TMP           // Use vsx if moving
-       CMP     TMP, $32                // at least 32 byte chunks
-       BLT     backwardlargeloop       // and distance >= 32
-       SRDCC   $2,DWORDS,QWORDS        // 32 byte chunks
-       BNE     backward32setup
+backward24:
+       // DWORDS is a value between 1-3.
+       CMP     DWORDS, $2
 
-backwardlargeloop:
        MOVD    -8(SRC), TMP
-       SUB     $8,SRC
        MOVD    TMP, -8(TGT)
-       SUB     $8,TGT
-       BC      16, 0, backwardlargeloop // bndz
+       BC      12, 0, LR               // bltlr, return if DWORDS == 1
+
+       MOVD    -16(SRC), TMP
+       MOVD    TMP, -16(TGT)
+       BC      12, 2, LR               // beqlr, return if DWORDS == 2
+
+       MOVD    -24(SRC), TMP
+       MOVD    TMP, -24(TGT)
        RET
 
 backward32setup:
-       MOVD    QWORDS, CTR                     // set up loop ctr
-       MOVD    $16, IDX16                      // 32 bytes at a time
+       ANDCC   $3,DWORDS               // Compute remaining DWORDS and compare to 0
+       MOVD    QWORDS, CTR             // set up loop ctr
+       MOVD    $16, IDX16              // 32 bytes at a time
 
 backward32loop:
        SUB     $32, TGT
        SUB     $32, SRC
-       LXVD2X  (R0)(TGT), VS32           // load 16 bytes
-       LXVD2X  (IDX16)(TGT), VS33
-       STXVD2X VS32, (R0)(SRC)           // store 16 bytes
-       STXVD2X VS33, (IDX16)(SRC)
-       BC      16, 0, backward32loop   // bndz
-       BC      4, 5, LR                // ble CR1 lr
-       MOVD    DWORDS, CTR
-       BR      backwardlargeloop
+       LXVD2X  (R0)(SRC), VS32         // load 16x2 bytes
+       LXVD2X  (IDX16)(SRC), VS33
+       STXVD2X VS32, (R0)(TGT)         // store 16x2 bytes
+       STXVD2X VS33, (IDX16)(TGT)
+       BC      16, 0, backward32loop   // bndz
+       BC      12, 2, LR               // beqlr, return if DWORDS == 0
+       BR      backward24