runtime: improve memmove performance ppc64,ppc64le
author     Lynn Boger <laboger@linux.vnet.ibm.com>      Wed, 13 Apr 2016 13:58:10 +0000 (08:58 -0500)
committer  Brad Fitzpatrick <bradfitz@golang.org>       Wed, 13 Apr 2016 15:27:59 +0000 (15:27 +0000)
This change improves the performance of memmove
on ppc64 & ppc64le, mainly for moves >= 32 bytes.
In addition, the test that detects backward moves
was enhanced to avoid backward moves when source
and dest are in different types of storage, since
backward moves might not always be efficient.
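
The overlap check itself is one unsigned compare: dest - src wraps
around when dest < src, so comparing the difference against the
length decides both cases at once. A minimal Go sketch of the idea
(illustrative only; needBackwardCopy is a hypothetical helper, not
the runtime's actual code):

    package main

    import (
            "fmt"
            "unsafe"
    )

    // needBackwardCopy reports whether a forward copy would clobber
    // the source, i.e. whether dest lands inside [src, src+n). The
    // unsigned subtraction wraps around when dest < src, so a single
    // unsigned comparison (the CMPU in the assembly) covers both the
    // "dest before src" and "no overlap" cases.
    func needBackwardCopy(dest, src unsafe.Pointer, n uintptr) bool {
            return uintptr(dest)-uintptr(src) < n
    }

    func main() {
            buf := make([]byte, 16)
            src := unsafe.Pointer(&buf[0])
            dest := unsafe.Pointer(&buf[4])
            fmt.Println(needBackwardCopy(dest, src, 8)) // true: dest inside src window
            fmt.Println(needBackwardCopy(src, dest, 8)) // false: forward copy is safe
    }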

Fixes #14507

The following shows some of the improvements from the
Memmove benchmarks in the runtime package
(old MB/s, new MB/s, speedup):

BenchmarkMemmove32                   4229.56      4717.13      1.12x
BenchmarkMemmove64                   6156.03      7810.42      1.27x
BenchmarkMemmove128                  7521.69      12468.54     1.66x
BenchmarkMemmove256                  6729.90      18260.33     2.71x
BenchmarkMemmove512                  8521.59      18033.81     2.12x
BenchmarkMemmove1024                 9760.92      25762.61     2.64x
BenchmarkMemmove2048                 10241.00     29584.94     2.89x
BenchmarkMemmove4096                 10399.37     31882.31     3.07x

BenchmarkMemmoveUnalignedDst16       1943.69      2258.33      1.16x
BenchmarkMemmoveUnalignedDst32       3885.08      3965.81      1.02x
BenchmarkMemmoveUnalignedDst64       5121.63      6965.54      1.36x
BenchmarkMemmoveUnalignedDst128      7212.34      11372.68     1.58x
BenchmarkMemmoveUnalignedDst256      6564.52      16913.59     2.58x
BenchmarkMemmoveUnalignedDst512      8364.35      17782.57     2.13x
BenchmarkMemmoveUnalignedDst1024     9539.87      24914.72     2.61x
BenchmarkMemmoveUnalignedDst2048     9199.23      21235.11     2.31x
BenchmarkMemmoveUnalignedDst4096     10077.39     25231.99     2.50x

BenchmarkMemmoveUnalignedSrc32       3249.83      3742.52      1.15x
BenchmarkMemmoveUnalignedSrc64       5562.35      6627.96      1.19x
BenchmarkMemmoveUnalignedSrc128      6023.98      10200.84     1.69x
BenchmarkMemmoveUnalignedSrc256      6921.83      15258.43     2.20x
BenchmarkMemmoveUnalignedSrc512      8593.13      16541.97     1.93x
BenchmarkMemmoveUnalignedSrc1024     9730.95      22927.84     2.36x
BenchmarkMemmoveUnalignedSrc2048     9793.28      21537.73     2.20x
BenchmarkMemmoveUnalignedSrc4096     10132.96     26295.06     2.60x
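
Numbers like these come from the Memmove benchmarks in the runtime
package; a typical way to rerun them (the exact flags are a
suggestion, not part of this change):

    go test runtime -run=NONE -bench=Memmove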

Change-Id: I73af59970d4c97c728deabb9708b31ec7e01bdf2
Reviewed-on: https://go-review.googlesource.com/21990
Reviewed-by: Bill O'Farrell <billotosyr@gmail.com>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
src/runtime/memmove_ppc64x.s

index ea73b455b478515f45262f37e30254d6020e5577..26dabd9e6947497cfabcf1dd437c3c78043379ef 100644 (file)
@@ -11,78 +11,109 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
        MOVD    to+0(FP), R3
        MOVD    from+8(FP), R4
        MOVD    n+16(FP), R5
-       CMP     R5, $0
-       BNE     check
-       RET
 
+       // Determine if there are double words to
+       // copy so a more efficient move can be done.
 check:
-       ANDCC   $7, R5, R7      // R7 is the number of bytes to copy and CR0[EQ] is set if there are none.
-       SRAD    $3, R5, R6      // R6 is the number of words to copy
-       CMP     R6, $0, CR1     // CR1[EQ] is set if there are no words to copy.
-
-       CMP     R3, R4, CR2
-       BC      12, 9, backward // I think you should be able to write this as "BGT CR2, backward"
+       ANDCC   $7, R5, R7      // R7: bytes to copy
+       SRAD    $3, R5, R6      // R6: double words to copy
+       CMP     R6, $0, CR1     // CR1[EQ] set if no double words to copy
 
-       // Copying forward proceeds by copying R6 words then copying R7 bytes.
-       // R3 and R4 are advanced as we copy. Because PPC64 lacks post-increment
-       // load/store, R3 and R4 point before the bytes that are to be copied.
+       // Determine overlap by subtracting dest - src and comparing against the
+       // length.  This catches the cases where src and dest are in different types
+       // of storage, such as stack and static, to avoid doing a backward move when
+       // not necessary.
 
-       BC      12, 6, noforwardlarge   // "BEQ CR1, noforwardlarge"
-
-       MOVD    R6, CTR
+       SUB     R4, R3, R8              // dest - src
+       CMPU    R8, R5, CR2             // unsigned compare: (dest - src) < len?
+       BC      12, 8, backward         // "BLT CR2, backward"
 
-       SUB     $8, R3
-       SUB     $8, R4
+       // Copying forward if no overlap.
 
-forwardlargeloop:
-       MOVDU   8(R4), R8
-       MOVDU   R8, 8(R3)
-       BC      16, 0, forwardlargeloop // "BDNZ"
-
-       ADD     $8, R3
-       ADD     $8, R4
+       BC      12, 6, noforwardlarge   // "BEQ CR1, noforwardlarge"
+       MOVD    R6,CTR                  // R6 = number of double words
+       SRADCC  $2,R6,R8                // R8 = number of 32 byte chunks
+       BNE     forward32setup          // branch if there are 32 byte chunks
+
+       // Move double words
+
+forward8:
+       MOVD    0(R4), R8               // load double word
+       ADD     $8,R4                   // advance src
+       MOVD    R8, 0(R3)               // store double word
+       ADD     $8,R3                   // advance dest
+       BC      16, 0, forward8         // "BDNZ" forward8
+       BR      noforwardlarge          // handle remainder
+
+       // Prepare for moves of 32 bytes at a time.
+
+forward32setup:
+       DCBTST  (R3)                    // touch dest cache block (for store)
+       DCBT    (R4)                    // touch src cache block
+       MOVD    R8, CTR                 // CTR = number of 32 byte chunks
+
+forward32:
+       MOVD    0(R4), R8               // load 4 double words
+       MOVD    8(R4), R9
+       MOVD    16(R4), R14
+       MOVD    24(R4), R15
+       ADD     $32,R4
+       MOVD    R8, 0(R3)               // store those 4
+       MOVD    R9, 8(R3)
+       MOVD    R14,16(R3)
+       MOVD    R15,24(R3)
+       ADD     $32,R3                  // bump up for next set
+       BC      16, 0, forward32        // "BDNZ" forward32
+       RLDCLCC $61,R5,$3,R6            // R6 = remaining double words, sets CR0
+       BEQ     noforwardlarge          // none remaining
+       MOVD    R6,CTR                  // set up the CTR for double words
+       BR      forward8
 
 noforwardlarge:
-       BNE     forwardtail     // Tests the bit set by ANDCC above
-       RET
+       CMP     R7,$0                   // any remaining bytes?
+       BC      4, 1, LR                // return if not GT (no remaining bytes)
 
 forwardtail:
-       SUB     $1, R3
-       SUB     $1, R4
-       MOVD    R7, CTR
+       MOVD    R7, CTR                 // move tail bytes
 
 forwardtailloop:
-       MOVBZU  1(R4), R8
-       MOVBZU  R8, 1(R3)
+       MOVBZ   0(R4), R8               // load single byte
+       ADD     $1,R4
+       MOVBZ   R8, 0(R3)               // store single byte
+       ADD     $1,R3
        BC      16, 0, forwardtailloop
        RET
 
 backward:
-       // Copying backwards proceeds by copying R7 bytes then copying R6 words.
+       // Copying backwards proceeds by copying R7 bytes then copying R6 double words.
        // R3 and R4 are advanced to the end of the destination/source buffers
        // respectively and moved back as we copy.
 
-       ADD     R5, R4, R4
-       ADD     R3, R5, R3
+       ADD     R5, R4, R4              // end of source
+       ADD     R3, R5, R3              // end of dest
 
-       BEQ     nobackwardtail
+       BEQ     nobackwardtail          // no tail bytes: CR0 still set by ANDCC above
 
-       MOVD    R7, CTR
+       MOVD    R7, CTR                 // bytes to move
 
 backwardtailloop:
-       MOVBZU  -1(R4), R8
-       MOVBZU  R8, -1(R3)
+       MOVBZ   -1(R4), R8              // load last byte
+       SUB     $1,R4
+       MOVBZ   R8, -1(R3)              // store last byte
+       SUB     $1,R3
        BC      16, 0, backwardtailloop
 
 nobackwardtail:
-       BC      4, 6, backwardlarge             // "BNE CR1"
-       RET
+       CMP     R6,$0                   // any double words?
+       BC      4, 5, LR                // return if no double words
 
 backwardlarge:
        MOVD    R6, CTR
 
 backwardlargeloop:
-       MOVDU   -8(R4), R8
-       MOVDU   R8, -8(R3)
-       BC      16, 0, backwardlargeloop        // "BDNZ"
+       MOVD    -8(R4), R8
+       SUB     $8,R4
+       MOVD    R8, -8(R3)
+       SUB     $8,R3
+       BC      16, 0, backwardlargeloop        // "BDNZ"
        RET
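
For readers less familiar with PPC64 assembly, the forward path above
has the following shape, shown here as a hypothetical Go sketch
(illustrative only; the real code works on raw pointers and also
primes the data cache with DCBTST/DCBT before the 32-byte loop):

    package main

    import (
            "encoding/binary"
            "fmt"
    )

    // forwardCopy mirrors the structure of the assembly's forward
    // path: unrolled 32-byte chunks (four double words loaded, then
    // four stored, as in forward32), single double words (forward8),
    // and a byte-at-a-time tail (forwardtailloop).
    func forwardCopy(dst, src []byte) {
            n := len(src)
            i := 0
            for ; n-i >= 32; i += 32 { // forward32
                    a := binary.LittleEndian.Uint64(src[i:])
                    b := binary.LittleEndian.Uint64(src[i+8:])
                    c := binary.LittleEndian.Uint64(src[i+16:])
                    d := binary.LittleEndian.Uint64(src[i+24:])
                    binary.LittleEndian.PutUint64(dst[i:], a)
                    binary.LittleEndian.PutUint64(dst[i+8:], b)
                    binary.LittleEndian.PutUint64(dst[i+16:], c)
                    binary.LittleEndian.PutUint64(dst[i+24:], d)
            }
            for ; n-i >= 8; i += 8 { // forward8
                    binary.LittleEndian.PutUint64(dst[i:], binary.LittleEndian.Uint64(src[i:]))
            }
            for ; i < n; i++ { // forwardtailloop
                    dst[i] = src[i]
            }
    }

    func main() {
            src := make([]byte, 45)
            for i := range src {
                    src[i] = byte(i)
            }
            dst := make([]byte, len(src))
            forwardCopy(dst, src)
            fmt.Println(dst[0], dst[44]) // 0 44
    }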