From 9ccf5b8e86ce98494a2127196fbc47d72b0a71a5 Mon Sep 17 00:00:00 2001 From: Lynn Boger Date: Fri, 8 Apr 2022 13:50:00 -0500 Subject: [PATCH] runtime: improve memmove for ppc64x MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This improves performance of memmove for larger moves by unrolling the main loop from 32 byte to 64 byte moves. The improvement of the relevant sizes on a power9: Memmove/64 5.11ns ± 0% 5.00ns ± 0% -2.21% Memmove/128 8.26ns ± 0% 5.88ns ± 0% -28.83% Memmove/256 12.7ns ± 0% 8.6ns ± 0% -31.94% Memmove/512 17.9ns ± 0% 14.3ns ± 0% -19.87% Memmove/1024 33.3ns ± 0% 27.0ns ± 0% -18.92% Memmove/2048 72.1ns ± 0% 51.8ns ± 0% -28.25% Memmove/4096 126ns ± 0% 110ns ± 0% -12.63% Change-Id: I74162a9f152d7752a8281da1b89a66da99a3fdc9 Reviewed-on: https://go-review.googlesource.com/c/go/+/399499 Run-TryBot: Lynn Boger TryBot-Result: Gopher Robot Reviewed-by: Cherry Mui Reviewed-by: Ian Lance Taylor --- src/runtime/memmove_ppc64x.s | 56 +++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s index 25101a28c7..5fa51c0a4c 100644 --- a/src/runtime/memmove_ppc64x.s +++ b/src/runtime/memmove_ppc64x.s @@ -24,8 +24,12 @@ #define IDX16 R8 // temp used for copies, etc. #define TMP R9 -// number of 32 byte chunks +// number of 64 byte chunks #define QWORDS R10 +// index values +#define IDX32 R14 +#define IDX48 R15 +#define OCTWORDS R16 TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24 // R3 = TGT = to @@ -52,28 +56,46 @@ check: // Copying forward if no overlap. BC 12, 6, checkbytes // BEQ CR1, checkbytes - SRDCC $2, DWORDS, QWORDS // 32 byte chunks? - BEQ lt32gt8 // < 32 bytes + SRDCC $3, DWORDS, OCTWORDS // 64 byte chunks? + MOVD $16, IDX16 + BEQ lt64gt8 // < 64 bytes - // Prepare for moves of 32 bytes at a time. + // Prepare for moves of 64 bytes at a time. -forward32setup: +forward64setup: DCBTST (TGT) // prepare data cache DCBT (SRC) - MOVD QWORDS, CTR // Number of 32 byte chunks - MOVD $16, IDX16 // 16 for index + MOVD OCTWORDS, CTR // Number of 64 byte chunks + MOVD $32, IDX32 + MOVD $48, IDX48 + PCALIGN $32 -forward32: - LXVD2X (R0)(SRC), VS32 // load 16 bytes - LXVD2X (IDX16)(SRC), VS33 // load 16 bytes - ADD $32, SRC - STXVD2X VS32, (R0)(TGT) // store 16 bytes +forward64: + LXVD2X (R0)(SRC), VS32 // load 64 bytes + LXVD2X (IDX16)(SRC), VS33 + LXVD2X (IDX32)(SRC), VS34 + LXVD2X (IDX48)(SRC), VS35 + ADD $64, SRC + STXVD2X VS32, (R0)(TGT) // store 64 bytes STXVD2X VS33, (IDX16)(TGT) - ADD $32,TGT // bump up for next set - BC 16, 0, forward32 // continue - ANDCC $3, DWORDS // remaining doublewords + STXVD2X VS34, (IDX32)(TGT) + STXVD2X VS35, (IDX48)(TGT) + ADD $64,TGT // bump up for next set + BC 16, 0, forward64 // continue + ANDCC $7, DWORDS // remaining doublewords BEQ checkbytes // only bytes remain +lt64gt8: + CMP DWORDS, $4 + BLT lt32gt8 + LXVD2X (R0)(SRC), VS32 + LXVD2X (IDX16)(SRC), VS33 + ADD $-4, DWORDS + STXVD2X VS32, (R0)(TGT) + STXVD2X VS33, (IDX16)(TGT) + ADD $32, SRC + ADD $32, TGT + lt32gt8: // At this point >= 8 and < 32 // Move 16 bytes if possible @@ -134,7 +156,7 @@ backwardtailloop: SUB $1,SRC MOVBZ TMP, -1(TGT) SUB $1,TGT - BC 16, 0, backwardtailloop // bndz + BDNZ backwardtailloop nobackwardtail: BC 4, 5, LR // blelr cr1, return if DWORDS == 0 @@ -169,6 +191,6 @@ backward32loop: LXVD2X (IDX16)(SRC), VS33 STXVD2X VS32, (R0)(TGT) // store 16x2 bytes STXVD2X VS33, (IDX16)(TGT) - BC 16, 0, backward32loop // bndz + BDNZ backward32loop BC 12, 2, LR // beqlr, return if DWORDS == 0 BR backward24 -- 2.50.0