MOVD to+0(FP), R3
MOVD from+8(FP), R4
MOVD n+16(FP), R5
- CMP R5, $0
- BNE check
- RET
+ // Determine if there are doublewords to
+ // copy so a more efficient move can be done
check:
- ANDCC $7, R5, R7 // R7 is the number of bytes to copy and CR0[EQ] is set if there are none.
- SRAD $3, R5, R6 // R6 is the number of words to copy
- CMP R6, $0, CR1 // CR1[EQ] is set if there are no words to copy.
-
- CMP R3, R4, CR2
- BC 12, 9, backward // I think you should be able to write this as "BGT CR2, backward"
+ ANDCC $7, R5, R7 // R7 = n & 7: tail bytes to copy; CR0[EQ] set if there are none
+ SRAD $3, R5, R6 // R6 = n >> 3: double words to copy
+ CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy
- // Copying forward proceeds by copying R6 words then copying R7 bytes.
- // R3 and R4 are advanced as we copy. Because PPC64 lacks post-increment
- // load/store, R3 and R4 point before the bytes that are to be copied.
+ // Determine overlap by subtracting dest - src and comparing against the
+ // length. This catches the cases where src and dest are in different types
+ // of storage such as stack and static, to avoid doing a backward move when
+ // it is not necessary.
- BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
-
- MOVD R6, CTR
+ SUB R4, R3, R8 // R8 = dest - src
+ CMPU R8, R5, CR2 // unsigned (dest - src) < len?
+ BC 12, 8, backward // BLT CR2 backward: dest starts inside [src, src+len)
- SUB $8, R3
- SUB $8, R4
+ // Copy forward if a forward copy cannot overwrite unread source bytes.
-forwardlargeloop:
- MOVDU 8(R4), R8
- MOVDU R8, 8(R3)
- BC 16, 0, forwardlargeloop // "BDNZ"
-
- ADD $8, R3
- ADD $8, R4
+ BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge": no double words, only tail bytes
+ MOVD R6,CTR // CTR = R6 = number of double words
+ SRADCC $2,R6,R8 // R8 = R6 >> 2: number of 32 byte chunks; sets CR0
+ BNE forward32setup // at least one 32 byte chunk to move
+
+ // Move double words one at a time; CTR holds the count.
+
+forward8:
+ MOVD 0(R4), R8 // load one double word
+ ADD $8,R4
+ MOVD R8, 0(R3) // store it
+ ADD $8,R3
+ BC 16, 0, forward8 // BDNZ: decrement CTR, loop while it is nonzero
+ BR noforwardlarge // handle remainder
+
+ // Prepare for moves of 32 bytes at a time.
+
+forward32setup:
+ DCBTST (R3) // prepare data cache: touch dest for store
+ DCBT (R4) // touch source for load
+ MOVD R8, CTR // CTR = number of 32 byte chunks
+
+forward32:
+ MOVD 0(R4), R8 // load 4 double words
+ MOVD 8(R4), R9
+ MOVD 16(R4), R14
+ MOVD 24(R4), R15
+ ADD $32,R4
+ MOVD R8, 0(R3) // store those 4
+ MOVD R9, 8(R3)
+ MOVD R14,16(R3)
+ MOVD R15,24(R3)
+ ADD $32,R3 // bump up for next set
+ BC 16, 0, forward32 // BDNZ: continue while 32 byte chunks remain
+ RLDCLCC $61,R5,$3,R6 // R6 = (n >> 3) & 3: remaining doublewords; sets CR0
+ BEQ noforwardlarge // no double words left over
+ MOVD R6,CTR // set up the CTR
+ BR forward8
noforwardlarge:
- BNE forwardtail // Tests the bit set by ANDCC above
- RET
+ CMP R7,$0 // any remaining bytes
+ BC 4, 1, LR // return if CR0[GT] is clear, i.e. no tail bytes
forwardtail:
- SUB $1, R3
- SUB $1, R4
- MOVD R7, CTR
+ MOVD R7, CTR // CTR = number of tail bytes to move
forwardtailloop:
- MOVBZU 1(R4), R8
- MOVBZU R8, 1(R3)
+ MOVBZ 0(R4), R8 // move single bytes
+ ADD $1,R4
+ MOVBZ R8, 0(R3)
+ ADD $1,R3
BC 16, 0, forwardtailloop
RET
backward:
- // Copying backwards proceeds by copying R7 bytes then copying R6 words.
+ // Copying backwards proceeds by copying R7 bytes then copying R6 double words.
// R3 and R4 are advanced to the end of the destination/source buffers
// respectively and moved back as we copy.
- ADD R5, R4, R4
- ADD R3, R5, R3
+ ADD R5, R4, R4 // end of source
+ ADD R3, R5, R3 // end of dest
- BEQ nobackwardtail
+ BEQ nobackwardtail // CR0[EQ] still holds the ANDCC result: no tail bytes
- MOVD R7, CTR
+ MOVD R7, CTR // bytes to move
backwardtailloop:
- MOVBZU -1(R4), R8
- MOVBZU R8, -1(R3)
+ MOVBZ -1(R4), R8 // load the byte just before R4
+ SUB $1,R4
+ MOVBZ R8, -1(R3)
+ SUB $1,R3
BC 16, 0, backwardtailloop
nobackwardtail:
- BC 4, 6, backwardlarge // "BNE CR1"
- RET
+ CMP R6,$0 // sets CR0; NOTE(review): the BC below tests CR1, so this looks redundant - confirm
+ BC 4, 5, LR // return if CR1[GT] is clear (CR1 set by CMP R6, $0, CR1): no double words
backwardlarge:
MOVD R6, CTR
backwardlargeloop:
- MOVDU -8(R4), R8
- MOVDU R8, -8(R3)
- BC 16, 0, backwardlargeloop // "BDNZ"
+ MOVD -8(R4), R8
+ SUB $8,R4
+ MOVD R8, -8(R3)
+ SUB $8,R3
+ BC 16, 0, backwardlargeloop // BDNZ: loop until all double words are moved
RET