// See memmove Go doc for important implementation constraints.
// func memmove(to, from unsafe.Pointer, n uintptr)
+
+// target address
+#define TGT R3
+// source address
+#define SRC R4
+// length to move
+#define LEN R5
+// number of doublewords
+#define DWORDS R6
+// number of bytes < 8
+#define BYTES R7
+// const 16 used as index
+#define IDX16 R8
+// temp used for copies, etc.
+#define TMP R9
+// number of 32 byte chunks
+#define QWORDS R10
+
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
+ // Loads to/from/n into TGT/SRC/LEN, decides between a forward and a
+ // backward copy, and performs the forward copy in this section.
+ // Condition register fields set during setup and tested later:
+ //   CR0: result of ANDCC $7 (EQ if BYTES == 0); still live at "backward"
+ //   CR1: DWORDS compared with 0
+ //   CR2: unsigned (TGT - SRC) compared with LEN
+ //   CR3: copy of CR0 from the ANDCC $7, tested at "checkbytes"
- MOVD to+0(FP), R3
- MOVD from+8(FP), R4
- MOVD n+16(FP), R5
+ MOVD to+0(FP), TGT
+ MOVD from+8(FP), SRC
+ MOVD n+16(FP), LEN
// Determine if there are doublewords to
// copy so a more efficient move can be done
check:
- ANDCC $7, R5, R7 // R7: bytes to copy
- SRD $3, R5, R6 // R6: double words to copy
- CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy
+ ANDCC $7, LEN, BYTES // BYTES (R7): tail bytes to copy (LEN & 7); sets CR0
+ SRD $3, LEN, DWORDS // DWORDS (R6): double words to copy (LEN >> 3)
+ MOVFL CR0, CR3 // save CR0 from ANDCC in CR3; later compares overwrite CR0
+ CMP DWORDS, $0, CR1 // CR1[EQ] set if no double words to copy
// Determine overlap by subtracting dest - src and comparing against the
- // length. The catches the cases where src and dest are in different types
+ // length. This catches the cases where src and dest are in different types
// of storage such as stack and static to avoid doing backward move when not
// necessary.
- SUB R4, R3, R8 // dest - src
- CMPU R8, R5, CR2 // < len?
+ SUB SRC, TGT, TMP // dest - src
+ CMPU TMP, LEN, CR2 // unsigned (dest - src) < len?
BC 12, 8, backward // BLT CR2 backward
// Copying forward if no overlap.
- BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
- SRDCC $2,R6,R8 // 32 byte chunks?
- BNE forward32setup //
- MOVD R6,CTR // R6 = number of double words
-
- // Move double words
-
-forward8:
- MOVD 0(R4), R8 // double word
- ADD $8,R4
- MOVD R8, 0(R3) //
- ADD $8,R3
- BC 16, 0, forward8
- BR noforwardlarge // handle remainder
+ BC 12, 6, checkbytes // BEQ CR1, checkbytes
+ SRDCC $2, DWORDS, QWORDS // QWORDS = DWORDS >> 2 (32 byte chunks); sets CR0
+ BEQ lt32gt8 // < 32 bytes
// Prepare for moves of 32 bytes at a time.
forward32setup:
- DCBTST (R3) // prepare data cache
- DCBT (R4)
- MOVD R8, CTR // double work count
- MOVD $16, R8
+ DCBTST (TGT) // prepare data cache
+ DCBT (SRC)
+ MOVD QWORDS, CTR // Number of 32 byte chunks
+ MOVD $16, IDX16 // 16 for index
forward32:
- LXVD2X (R4+R0), VS32 // load 16 bytes
- LXVD2X (R4+R8), VS33
- ADD $32, R4
- STXVD2X VS32, (R3+R0) // store 16 bytes
- STXVD2X VS33, (R3+R8)
- ADD $32,R3 // bump up for next set
+ LXVD2X (R0)(SRC), VS32 // load 16 bytes
+ LXVD2X (IDX16)(SRC), VS33 // load 16 bytes
+ ADD $32, SRC
+ STXVD2X VS32, (R0)(TGT) // store 16 bytes
+ STXVD2X VS33, (IDX16)(TGT)
+ ADD $32,TGT // bump up for next set
BC 16, 0, forward32 // continue
- RLDCLCC $61,R5,$3,R6 // remaining doublewords
- BEQ noforwardlarge
- MOVD R6,CTR // set up the CTR
- BR forward8
-
-noforwardlarge:
- CMP R7,$0 // any remaining bytes
- BC 4, 1, LR // ble lr
-
-forwardtail:
- MOVD R7, CTR // move tail bytes
-
-forwardtailloop:
- MOVBZ 0(R4), R8 // move single bytes
- ADD $1,R4
- MOVBZ R8, 0(R3)
- ADD $1,R3
- BC 16, 0, forwardtailloop
+ ANDCC $3, DWORDS // DWORDS &= 3: doublewords left after the 32 byte chunks
+ BEQ checkbytes // only bytes remain
+
+lt32gt8:
+ // At this point DWORDS is 1, 2 or 3, i.e. >= 8 and < 32 bytes
+ // of whole doublewords remain (the BYTES tail is handled later).
+ // Move 16 bytes if possible
+ CMP DWORDS, $2
+ BLT lt16
+ LXVD2X (R0)(SRC), VS32
+ ADD $-2, DWORDS // two doublewords consumed
+ STXVD2X VS32, (R0)(TGT)
+ ADD $16, SRC
+ ADD $16, TGT
+
+lt16: // Move 8 bytes if possible
+ CMP DWORDS, $1
+ BLT checkbytes
+ MOVD 0(SRC), TMP
+ ADD $8, SRC
+ MOVD TMP, 0(TGT)
+ ADD $8, TGT
+checkbytes:
+ BC 12, 14, LR // BEQ CR3, lr: return if BYTES == 0 (CR3 saved from ANDCC $7)
+lt8: // Move word if possible
+ CMP BYTES, $4
+ BLT lt4
+ MOVWZ 0(SRC), TMP
+ ADD $-4, BYTES // four tail bytes consumed
+ MOVW TMP, 0(TGT)
+ ADD $4, SRC
+ ADD $4, TGT
+lt4: // Move halfword if possible
+ CMP BYTES, $2
+ BLT lt2
+ MOVHZ 0(SRC), TMP
+ ADD $-2, BYTES // two tail bytes consumed
+ MOVH TMP, 0(TGT)
+ ADD $2, SRC
+ ADD $2, TGT
+lt2: // Move last byte if 1 left
+ CMP BYTES, $1
+ BC 12, 0, LR // BLT lr: return if no byte is left (tests CR0[LT])
+ MOVBZ 0(SRC), TMP
+ MOVBZ TMP, 0(TGT)
RET
backward:
+ // Backward copy, taken when the unsigned difference TGT - SRC is less
+ // than LEN. The BYTES tail at the high end is copied first, one byte at
+ // a time, followed by the DWORDS doublewords, moving downwards.
+ // On entry CR0 still holds the result of the ANDCC that computed BYTES
+ // and CR1 holds DWORDS compared with 0.
// R3 and R4 are advanced to the end of the destination/source buffers
// respectively and moved back as we copy.
- ADD R5, R4, R4 // end of source
- ADD R3, R5, R3 // end of dest
+ ADD LEN, SRC, SRC // end of source
+ ADD TGT, LEN, TGT // end of dest
BEQ nobackwardtail // earlier condition
- MOVD R7, CTR // bytes to move
+ MOVD BYTES, CTR // bytes to move
backwardtailloop:
- MOVBZ -1(R4), R8 // point to last byte
- SUB $1,R4
- MOVBZ R8, -1(R3)
- SUB $1,R3
+ MOVBZ -1(SRC), TMP // point to last byte
+ SUB $1,SRC
+ MOVBZ TMP, -1(TGT)
+ SUB $1,TGT
BC 16, 0, backwardtailloop // bndz
nobackwardtail:
BC 4, 5, LR // ble CR1 lr
backwardlarge:
+ // NOTE(review): TMP below is SRC - TGT. On this path TGT - SRC lies in
+ // [0, LEN), so TMP looks like it is never positive and the signed
+ // compare with 32 would always select backwardlargeloop. Confirm
+ // whether TGT - SRC was intended before relying on the VSX loop.
- MOVD R6, CTR
- SUB R3, R4, R9 // Use vsx if moving
- CMP R9, $32 // at least 32 byte chunks
+ MOVD DWORDS, CTR
+ SUB TGT, SRC, TMP // Use vsx if moving
+ CMP TMP, $32 // at least 32 byte chunks
BLT backwardlargeloop // and distance >= 32
- SRDCC $2,R6,R8 // 32 byte chunks
+ SRDCC $2,DWORDS,QWORDS // 32 byte chunks
BNE backward32setup
backwardlargeloop:
- MOVD -8(R4), R8
- SUB $8,R4
- MOVD R8, -8(R3)
- SUB $8,R3
+ MOVD -8(SRC), TMP
+ SUB $8,SRC
+ MOVD TMP, -8(TGT)
+ SUB $8,TGT
BC 16, 0, backwardlargeloop // bndz
RET
backward32setup:
- MOVD R8, CTR // set up loop ctr
- MOVD $16, R8 // 32 bytes at at time
+ MOVD QWORDS, CTR // set up loop ctr
+ MOVD $16, IDX16 // 32 bytes at a time
backward32loop:
- SUB $32, R4
- SUB $32, R3
- LXVD2X (R4+R0), VS32 // load 16 bytes
- LXVD2X (R4+R8), VS33
- STXVD2X VS32, (R3+R0) // store 16 bytes
- STXVD2X VS33, (R3+R8)
+ SUB $32, TGT
+ SUB $32, SRC
+ // Both loads are issued before either store, reading from the source
+ // and writing to the target (same direction as the replaced R4/R3 code).
+ LXVD2X (R0)(SRC), VS32 // load 16 bytes from source
+ LXVD2X (IDX16)(SRC), VS33
+ STXVD2X VS32, (R0)(TGT) // store 16 bytes to target
+ STXVD2X VS33, (IDX16)(TGT)
BC 16, 0, backward32loop // bndz
- BC 4, 5, LR // ble CR1 lr
- MOVD R6, CTR
+ // Only the doublewords not covered by the 32 byte chunks remain.
+ // Returning on zero is required: a CTR of 0 would not end the bdnz loop.
+ ANDCC $3, DWORDS // DWORDS &= 3: remaining doublewords; sets CR0
+ BC 12, 2, LR // BEQ lr: nothing left to copy
+ MOVD DWORDS, CTR
BR backwardlargeloop