From d82c294da778a789099f3b52cd9c34ef0d798465 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Tue, 8 Feb 2022 09:09:36 -0600 Subject: [PATCH] runtime: fix 32B backward copy on ppc64x MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The test to enter the 32B copy loop always fails, and execution falls back to a single 8B/iteration copy loop for copies of more than 7 bytes. Likewise, the 32B loop has SRC/DST args mixed, and fails to truncate DWORDS after completing. Fix these, and unroll the 8B/iteration loop as it will only execute 1-3 times if reached. POWER10 benchmarks: name old speed new speed delta MemmoveOverlap/32 5.28GB/s ± 0% 10.37GB/s ± 0% +96.22% MemmoveOverlap/64 5.97GB/s ± 0% 18.15GB/s ± 0% +203.95% MemmoveOverlap/128 7.67GB/s ± 0% 24.35GB/s ± 0% +217.41% MemmoveOverlap/256 14.1GB/s ± 0% 25.0GB/s ± 0% +77.48% MemmoveOverlap/512 14.2GB/s ± 0% 30.9GB/s ± 0% +118.19% MemmoveOverlap/1024 12.3GB/s ± 0% 36.4GB/s ± 0% +194.75% MemmoveOverlap/2048 13.7GB/s ± 0% 48.8GB/s ± 0% +255.24% MemmoveOverlap/4096 14.1GB/s ± 0% 43.4GB/s ± 0% +208.80% MemmoveUnalignedDstOverlap/32 5.07GB/s ± 0% 3.78GB/s ± 0% -25.33% MemmoveUnalignedDstOverlap/64 6.00GB/s ± 0% 9.59GB/s ± 0% +59.78% MemmoveUnalignedDstOverlap/128 7.66GB/s ± 0% 13.51GB/s ± 0% +76.42% MemmoveUnalignedDstOverlap/256 13.4GB/s ± 0% 24.3GB/s ± 0% +80.92% MemmoveUnalignedDstOverlap/512 13.9GB/s ± 0% 30.3GB/s ± 0% +118.29% MemmoveUnalignedDstOverlap/1024 12.3GB/s ± 0% 37.3GB/s ± 0% +203.07% MemmoveUnalignedDstOverlap/2048 13.7GB/s ± 0% 45.9GB/s ± 0% +235.39% MemmoveUnalignedDstOverlap/4096 13.9GB/s ± 0% 41.2GB/s ± 0% +196.34% MemmoveUnalignedSrcOverlap/32 5.13GB/s ± 0% 5.18GB/s ± 0% +0.98% MemmoveUnalignedSrcOverlap/64 6.26GB/s ± 0% 9.53GB/s ± 0% +52.29% MemmoveUnalignedSrcOverlap/128 7.94GB/s ± 0% 18.40GB/s ± 0% +131.76% MemmoveUnalignedSrcOverlap/256 14.1GB/s ± 0% 25.5GB/s ± 0% +81.40% MemmoveUnalignedSrcOverlap/512 14.2GB/s ± 0% 30.9GB/s ± 0% +116.76% 
MemmoveUnalignedSrcOverlap/1024 12.4GB/s ± 0% 46.4GB/s ± 0% +275.22% MemmoveUnalignedSrcOverlap/2048 13.7GB/s ± 0% 48.7GB/s ± 0% +255.16% MemmoveUnalignedSrcOverlap/4096 14.0GB/s ± 0% 43.2GB/s ± 0% +208.89% Change-Id: I9fc6956ff454a2856d56077d1014388fb74c1f52 Reviewed-on: https://go-review.googlesource.com/c/go/+/384074 Trust: Paul Murphy Run-TryBot: Paul Murphy Reviewed-by: Lynn Boger Reviewed-by: Cherry Mui TryBot-Result: Gopher Robot --- src/runtime/memmove_ppc64x.s | 46 +++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s index e69e71a4a1..2152fb4f69 100644 --- a/src/runtime/memmove_ppc64x.s +++ b/src/runtime/memmove_ppc64x.s @@ -139,36 +139,38 @@ backwardtailloop: BC 16, 0, backwardtailloop // bndz nobackwardtail: - BC 4, 5, LR // ble CR1 lr + BC 4, 5, LR // blelr cr1, return if DWORDS == 0 + SRDCC $2,DWORDS,QWORDS // Compute number of 32B blocks and compare to 0 + BNE backward32setup // If QWORDS != 0, start the 32B copy loop. -backwardlarge: - MOVD DWORDS, CTR - SUB TGT, SRC, TMP // Use vsx if moving - CMP TMP, $32 // at least 32 byte chunks - BLT backwardlargeloop // and distance >= 32 - SRDCC $2,DWORDS,QWORDS // 32 byte chunks - BNE backward32setup +backward24: + // DWORDS is a value between 1-3. 
+ CMP DWORDS, $2 -backwardlargeloop: MOVD -8(SRC), TMP - SUB $8,SRC MOVD TMP, -8(TGT) - SUB $8,TGT - BC 16, 0, backwardlargeloop // bndz + BC 12, 0, LR // bltlr, return if DWORDS == 1 + + MOVD -16(SRC), TMP + MOVD TMP, -16(TGT) + BC 12, 2, LR // beqlr, return if DWORDS == 2 + + MOVD -24(SRC), TMP + MOVD TMP, -24(TGT) RET backward32setup: - MOVD QWORDS, CTR // set up loop ctr - MOVD $16, IDX16 // 32 bytes at a time + ANDCC $3,DWORDS // Compute remaining DWORDS and compare to 0 + MOVD QWORDS, CTR // set up loop ctr + MOVD $16, IDX16 // 32 bytes at a time backward32loop: SUB $32, TGT SUB $32, SRC - LXVD2X (R0)(TGT), VS32 // load 16 bytes - LXVD2X (IDX16)(TGT), VS33 - STXVD2X VS32, (R0)(SRC) // store 16 bytes - STXVD2X VS33, (IDX16)(SRC) - BC 16, 0, backward32loop // bndz - BC 4, 5, LR // ble CR1 lr - MOVD DWORDS, CTR - BR backwardlargeloop + LXVD2X (R0)(SRC), VS32 // load 16x2 bytes + LXVD2X (IDX16)(SRC), VS33 + STXVD2X VS32, (R0)(TGT) // store 16x2 bytes + STXVD2X VS33, (IDX16)(TGT) + BC 16, 0, backward32loop // bndz + BC 12, 2, LR // beqlr, return if DWORDS == 0 + BR backward24 -- 2.50.0