]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: improve memmove on ppc64x/power10
authorArchana R <aravind5@in.ibm.com>
Wed, 22 Feb 2023 11:52:15 +0000 (05:52 -0600)
committerLynn Boger <laboger@linux.vnet.ibm.com>
Wed, 22 Mar 2023 11:36:20 +0000 (11:36 +0000)
Rewrite memmove asm function to use the new power10 instructions
lxvl and stxvl or the load and store vector with length which can
specify the number of bytes to be loaded/stored in a register,
thereby avoiding multiple instructions to process 8bytes, 4bytes,
2bytes and a single byte while storing the tail end bytes. On power9
and power8 the code remains unchanged.
The performance for all sizes<=16 improve on power10 with this change.

name                          old time/op    new time/op    delta
Memmove/1                2.87ns ±  0%    2.64ns ±  1%    -8.11%
Memmove/2                2.85ns ±  0%    2.62ns ±  1%    -8.12%
Memmove/3                2.78ns ±  0%    2.63ns ±  1%    -5.33%
Memmove/4                2.83ns ±  0%    2.63ns ±  2%    -7.33%
Memmove/5                2.78ns ±  0%    2.63ns ±  1%    -5.40%
Memmove/6                2.61ns ±  3%    2.61ns ±  1%      ~
Memmove/7                2.82ns ±  0%    2.61ns ±  1%    -7.48%
Memmove/8                2.82ns ±  0%    2.65ns ±  1%    -6.11%
Memmove/9                6.41ns ±  0%    2.62ns ±  1%   -59.17%
Memmove/10               5.09ns ±  1%    2.60ns ±  1%   -48.90%
Memmove/11               4.68ns ±  7%    2.59ns ±  1%   -44.56%
Memmove/12               6.25ns ±  2%    2.60ns ±  1%   -58.46%
Memmove/13               4.15ns ± 25%    2.59ns ±  1%   -37.66%
Memmove/14               3.76ns ± 11%    2.59ns ±  1%   -30.94%
Memmove/15               3.82ns ±  1%    2.60ns ±  1%   -31.93%
Memmove/16               2.96ns ±  1%    2.59ns ±  1%   -12.63%
MemmoveUnalignedDst/1    3.07ns ±  0%    2.77ns ±  0%    -9.75%
MemmoveUnalignedDst/2    2.82ns ±  0%    2.77ns ±  0%    -1.73%
MemmoveUnalignedDst/3    3.03ns ±  0%    2.77ns ±  0%    -8.75%
MemmoveUnalignedDst/4    2.85ns ±  1%    2.77ns ±  0%    -2.90%
MemmoveUnalignedDst/5    3.03ns ±  0%    2.77ns ±  0%    -8.75%
MemmoveUnalignedDst/6    2.88ns ±  0%    2.77ns ±  0%    -4.04%
MemmoveUnalignedDst/7    3.11ns ±  0%    2.77ns ±  0%   -11.10%
MemmoveUnalignedDst/8    4.18ns ±  2%    2.77ns ±  0%   -33.90%
MemmoveUnalignedDst/9    6.36ns ±  1%    2.77ns ±  0%   -56.53%
MemmoveUnalignedDst/10   5.77ns ±  1%    2.77ns ±  0%   -52.09%
MemmoveUnalignedDst/11   4.68ns ±  1%    2.77ns ±  0%   -40.86%
MemmoveUnalignedDst/12   4.54ns ±  2%    2.77ns ±  0%   -39.05%
MemmoveUnalignedDst/13   6.16ns ±  5%    2.77ns ±  0%   -55.14%
MemmoveUnalignedDst/14   4.03ns ±  2%    2.77ns ±  0%   -31.41%
MemmoveUnalignedDst/15   4.11ns ±  0%    2.77ns ±  0%   -32.74%
MemmoveUnalignedDst/16   3.49ns ±  4%    2.79ns ±  1%   -20.04%
MemmoveUnalignedSrc/1    3.06ns ±  0%    2.77ns ±  0%    -9.68%
MemmoveUnalignedSrc/2    2.82ns ±  1%    2.77ns ±  0%    -1.93%
MemmoveUnalignedSrc/3    3.04ns ±  0%    2.77ns ±  0%    -8.95%
MemmoveUnalignedSrc/4    2.85ns ±  0%    2.77ns ±  0%    -2.86%
MemmoveUnalignedSrc/5    3.04ns ±  0%    2.77ns ±  0%    -8.97%
MemmoveUnalignedSrc/6    2.93ns ±  0%    2.77ns ±  0%    -5.43%
MemmoveUnalignedSrc/7    3.13ns ±  0%    2.77ns ±  0%   -11.56%
MemmoveUnalignedSrc/8    3.71ns ±  2%    2.77ns ±  0%   -25.46%
MemmoveUnalignedSrc/9    6.04ns ±  0%    2.77ns ±  0%   -54.16%
MemmoveUnalignedSrc/10   6.86ns ±  5%    2.77ns ±  0%   -59.69%
MemmoveUnalignedSrc/11   4.18ns ±  3%    2.77ns ±  0%   -33.81%
MemmoveUnalignedSrc/12   4.75ns ±  2%    2.77ns ±  0%   -41.81%
MemmoveUnalignedSrc/13   4.78ns ±  3%    2.77ns ±  0%   -42.15%
MemmoveUnalignedSrc/14   3.89ns ±  5%    2.77ns ±  0%   -28.80%
MemmoveUnalignedSrc/15   4.09ns ±  0%    2.77ns ±  0%   -32.30%
MemmoveUnalignedSrc/16   3.15ns ±  1%    2.77ns ±  0%   -12.05%
Change-Id: Ia3c09d968dada71a794e5ccab3300ea9c46d8374
Reviewed-on: https://go-review.googlesource.com/c/go/+/470135
Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Heschi Kreinick <heschi@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>

src/runtime/memmove_ppc64x.s

index 5fa51c0a4cf9d673c9f6d8775bec3bf8ec821150..dee59054b9010087b88ed939f5cbe69ca0b1364a 100644 (file)
@@ -39,6 +39,15 @@ TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
        // Determine if there are doublewords to
        // copy so a more efficient move can be done
 check:
+#ifdef GOPPC64_power10
+       CMP     LEN, $16
+       BGT     mcopy
+       SLD     $56, LEN, TMP
+       LXVL    SRC, TMP, V0
+       STXVL   V0, TGT, TMP
+       RET
+#endif
+mcopy:
        ANDCC   $7, LEN, BYTES  // R7: bytes to copy
        SRD     $3, LEN, DWORDS // R6: double words to copy
        MOVFL   CR0, CR3        // save CR from ANDCC
@@ -110,12 +119,26 @@ lt32gt8:
 lt16:  // Move 8 bytes if possible
        CMP     DWORDS, $1
        BLT     checkbytes
+#ifdef GOPPC64_power10
+       ADD     $8, BYTES
+       SLD     $56, BYTES, TMP
+       LXVL    SRC, TMP, V0
+       STXVL   V0, TGT, TMP
+       RET
+#endif
+
        MOVD    0(SRC), TMP
        ADD     $8, SRC
        MOVD    TMP, 0(TGT)
        ADD     $8, TGT
 checkbytes:
        BC      12, 14, LR              // BEQ lr
+#ifdef GOPPC64_power10
+       SLD     $56, BYTES, TMP
+       LXVL    SRC, TMP, V0
+       STXVL   V0, TGT, TMP
+       RET
+#endif
 lt8:   // Move word if possible
        CMP BYTES, $4
        BLT lt4
@@ -183,6 +206,7 @@ backward32setup:
        ANDCC   $3,DWORDS               // Compute remaining DWORDS and compare to 0
        MOVD    QWORDS, CTR             // set up loop ctr
        MOVD    $16, IDX16              // 32 bytes at a time
+       PCALIGN $32
 
 backward32loop:
        SUB     $32, TGT