// See memmove Go doc for important implementation constraints.
+// Register map
+//
+// dstin R0
+// src R1
+// count R2
+// dst R3 (same as R0, but gets modified in unaligned cases)
+// srcend R4
+// dstend R5
+// data R6-R17
+// tmp1 R14
+
+// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+// copies of up to 128 bytes, and large copies. The overhead of the overlap
+// check is negligible since it is only required for large copies.
+//
+// Large copies use a software pipelined loop processing 64 bytes per iteration.
+// The destination pointer is 16-byte aligned to minimize unaligned accesses.
+// The loop tail is handled by always copying 64 bytes from the end.
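+// Copying the final 64 bytes unconditionally may rewrite bytes already
+// written by the last loop iteration; the redundant stores are cheaper
+// than a byte-granular tail loop.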
+
// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
- MOVD to+0(FP), R3
- MOVD from+8(FP), R4
- MOVD n+16(FP), R5
- CBNZ R5, check
- RET
+ MOVD to+0(FP), R0
+ MOVD from+8(FP), R1
+ MOVD n+16(FP), R2
+ CBZ R2, copy0
-check:
- CMP $16, R5
+ // Small copies: 1..16 bytes
+ CMP $16, R2
BLE copy16
- AND $~31, R5, R7 // R7 is N&~31
- SUB R7, R5, R6 // R6 is N&31
-
- CMP R3, R4
- BLT backward
-
- // Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
- // R3 and R4 are advanced as we copy.
-
- // (There may be implementations of armv8 where copying by bytes until
- // at least one of source or dest is word aligned is a worthwhile
- // optimization, but the on the one tested so far (xgene) it did not
- // make a significance difference.)
-
- CBZ R7, noforwardlarge // Do we need to do any quadword copying?
-
- ADD R3, R7, R9 // R9 points just past where we copy by word
-
-forwardlargeloop:
- // Copy 32 bytes at a time.
- LDP.P 32(R4), (R8, R10)
- STP.P (R8, R10), 32(R3)
- LDP -16(R4), (R11, R12)
- STP (R11, R12), -16(R3)
- SUB $32, R7, R7
- CBNZ R7, forwardlargeloop
-
-noforwardlarge:
- CBNZ R6, forwardtail // Do we need to copy any tail bytes?
+ // Large copies
+ CMP $128, R2
+ BHI copy_long
+ CMP $32, R2
+ BHI copy32_128
+
+ // Small copies: 17..32 bytes.
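+ // Both the first and the last 16 bytes are loaded before anything is
+ // stored, so this handles counts below 32 (the two pairs overlap in the
+ // middle) as well as overlapping src/dst buffers.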
+ LDP (R1), (R6, R7)
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ LDP -16(R4), (R12, R13)
+ STP (R6, R7), (R0)
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ STP (R12, R13), -16(R5)
RET
-forwardtail:
- // There are R6 <= 31 bytes remaining to copy.
- // This is large enough to still contain pointers,
- // which must be copied atomically.
- // Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
- TBZ $4, R6, 3(PC) // write 16 bytes if R6&16 != 0
- LDP.P 16(R4), (R8, R10)
- STP.P (R8, R10), 16(R3)
-
- TBZ $3, R6, 3(PC) // write 8 bytes if R6&8 != 0
- MOVD.P 8(R4), R8
- MOVD.P R8, 8(R3)
-
- AND $7, R6
- CBNZ R6, 2(PC)
- RET
-
- ADD R3, R6, R9 // R9 points just past the destination memory
-
-forwardtailloop:
- MOVBU.P 1(R4), R8
- MOVBU.P R8, 1(R3)
- CMP R3, R9
- BNE forwardtailloop
- RET
-
- // Small copies: 1..16 bytes.
+// Small copies: 1..16 bytes.
copy16:
- ADD R4, R5, R8 // R8 points just past the last source byte
- ADD R3, R5, R9 // R9 points just past the last destination byte
- CMP $8, R5
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ CMP $8, R2
BLT copy7
- MOVD (R4), R6
- MOVD -8(R8), R7
- MOVD R6, (R3)
- MOVD R7, -8(R9)
+ MOVD (R1), R6
+ MOVD -8(R4), R7
+ MOVD R6, (R0)
+ MOVD R7, -8(R5)
RET
copy7:
- TBZ $2, R5, copy3
- MOVWU (R4), R6
- MOVWU -4(R8), R7
- MOVW R6, (R3)
- MOVW R7, -4(R9)
+ TBZ $2, R2, copy3
+ MOVWU (R1), R6
+ MOVWU -4(R4), R7
+ MOVW R6, (R0)
+ MOVW R7, -4(R5)
RET
copy3:
- TBZ $1, R5, copy1
- MOVHU (R4), R6
- MOVHU -2(R8), R7
- MOVH R6, (R3)
- MOVH R7, -2(R9)
+ TBZ $1, R2, copy1
+ MOVHU (R1), R6
+ MOVHU -2(R4), R7
+ MOVH R6, (R0)
+ MOVH R7, -2(R5)
RET
copy1:
- MOVBU (R4), R6
- MOVB R6, (R3)
- RET
-
-backward:
- // Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
- // R3 and R4 are advanced to the end of the destination/source buffers
- // respectively and moved back as we copy.
-
- ADD R4, R5, R4 // R4 points just past the last source byte
- ADD R3, R5, R3 // R3 points just past the last destination byte
-
- CBZ R6, nobackwardtail // Do we need to do any byte-by-byte copying?
+ MOVBU (R1), R6
+ MOVB R6, (R0)
- AND $7, R6, R12
- CBZ R12, backwardtaillarge
-
- SUB R12, R3, R9 // R9 points at the lowest destination byte that should be copied by byte.
-backwardtailloop:
- // Copy sub-pointer-size tail.
- MOVBU.W -1(R4), R8
- MOVBU.W R8, -1(R3)
- CMP R9, R3
- BNE backwardtailloop
-
-backwardtaillarge:
- // Do 8/16-byte write if possible.
- // See comment at forwardtail.
- TBZ $3, R6, 3(PC)
- MOVD.W -8(R4), R8
- MOVD.W R8, -8(R3)
+copy0:
+ RET
- TBZ $4, R6, 3(PC)
- LDP.W -16(R4), (R8, R10)
- STP.W (R8, R10), -16(R3)
+ // Medium copies: 33..128 bytes.
+copy32_128:
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ LDP (R1), (R6, R7)
+ LDP 16(R1), (R8, R9)
+ LDP -32(R4), (R10, R11)
+ LDP -16(R4), (R12, R13)
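+ // R6-R13 now hold the first 32 and the last 32 bytes; for 33..64-byte
+ // copies these four pairs cover the whole buffer (overlapping in the
+ // middle), so the stores below finish the job. Larger sizes branch to
+ // copy128 first.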
+ CMP $64, R2
+ BHI copy128
+ STP (R6, R7), (R0)
+ STP (R8, R9), 16(R0)
+ STP (R10, R11), -32(R5)
+ STP (R12, R13), -16(R5)
+ RET
-nobackwardtail:
- CBNZ R7, backwardlarge // Do we need to do any doubleword-by-doubleword copying?
+ // Copy 65..128 bytes.
+copy128:
+ LDP 32(R1), (R14, R15)
+ LDP 48(R1), (R16, R17)
+ CMP $96, R2
+ BLS copy96
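+ // 97..128 bytes: the first 64 bytes (R6-R9, R14-R17) plus the last 64
+ // bytes (loaded here and in copy32_128) cover the whole range.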
+ LDP -64(R4), (R2, R3)
+ LDP -48(R4), (R1, R4)
+ STP (R2, R3), -64(R5)
+ STP (R1, R4), -48(R5)
+
+copy96:
+ STP (R6, R7), (R0)
+ STP (R8, R9), 16(R0)
+ STP (R14, R15), 32(R0)
+ STP (R16, R17), 48(R0)
+ STP (R10, R11), -32(R5)
+ STP (R12, R13), -16(R5)
RET
-backwardlarge:
- SUB R7, R3, R9 // R9 points at the lowest destination byte
+ // Copy more than 128 bytes.
+copy_long:
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ MOVD ZR, R7
+ MOVD ZR, R8
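+ // R7 and R8 default to zero, so copies shorter than 1024 bytes compute
+ // a realignment offset of 0 below and skip the extra alignment work.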
+
+ CMP $1024, R2
+ BLT backward_check
+ // feature detect to decide how to align
+ MOVBU runtime·arm64UseAlignedLoads(SB), R6
+ CBNZ R6, use_aligned_loads
+ MOVD R0, R7
+ MOVD R5, R8
+ B backward_check
+use_aligned_loads:
+ MOVD R1, R7
+ MOVD R4, R8
+ // R7 and R8 hold the address pair chosen for realignment: with
+ // aligned loads R7 is the src pointer and R8 is the srcend
+ // pointer; otherwise (aligned stores) R7 is the dst pointer and
+ // R8 is the dstend pointer. The forward copy below aligns on R7,
+ // the backward copy on the end pointer R8.
+
+backward_check:
+ // Use backward copy if there is an overlap.
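+ // The unsigned test (dst-src) < count is true exactly when dst lies
+ // inside [src, src+count), the only case where a forward copy would
+ // overwrite source bytes before they have been read.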
+ SUB R1, R0, R14
+ CBZ R14, copy0
+ CMP R2, R14
+ BCC copy_long_backward
+
+ // Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
+ LDP (R1), (R12, R13) // Load A
+ AND $15, R7, R14 // Calculate the realignment offset
+ SUB R14, R1, R1
+ SUB R14, R0, R3 // move dst back same amount as src
+ ADD R14, R2, R2
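+ // R14 is the low 4 bits of the pointer chosen via R7, so for copies of
+ // at least 1024 bytes either the loads or the stores below run on
+ // 16-byte aligned addresses (R14 is 0 for shorter copies); the up-to-15
+ // head bytes that get recopied were already written by chunk A.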
+ LDP 16(R1), (R6, R7) // Load B
+ STP (R12, R13), (R0) // Store A
+ LDP 32(R1), (R8, R9) // Load C
+ LDP 48(R1), (R10, R11) // Load D
+ LDP.W 64(R1), (R12, R13) // Load E
+ // 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
+ SUBS $144, R2, R2
+ BLS copy64_from_end
+
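+ // Software pipelined: each iteration stores the 64 bytes loaded by the
+ // previous iteration while issuing the loads for the next 64, hiding
+ // load latency behind the stores.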
+loop64:
+ STP (R6, R7), 16(R3) // Store B
+ LDP 16(R1), (R6, R7) // Load B (next iteration)
+ STP (R8, R9), 32(R3) // Store C
+ LDP 32(R1), (R8, R9) // Load C
+ STP (R10, R11), 48(R3) // Store D
+ LDP 48(R1), (R10, R11) // Load D
+ STP.W (R12, R13), 64(R3) // Store E
+ LDP.W 64(R1), (R12, R13) // Load E
+ SUBS $64, R2, R2
+ BHI loop64
+
+ // Write the last iteration and copy 64 bytes from the end.
+copy64_from_end:
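+ // Stores B-E drain the last pipelined iteration; loads/stores F-I then
+ // copy the final 64 bytes, possibly overlapping the E store.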
+ LDP -64(R4), (R14, R15) // Load F
+ STP (R6, R7), 16(R3) // Store B
+ LDP -48(R4), (R6, R7) // Load G
+ STP (R8, R9), 32(R3) // Store C
+ LDP -32(R4), (R8, R9) // Load H
+ STP (R10, R11), 48(R3) // Store D
+ LDP -16(R4), (R10, R11) // Load I
+ STP (R12, R13), 64(R3) // Store E
+ STP (R14, R15), -64(R5) // Store F
+ STP (R6, R7), -48(R5) // Store G
+ STP (R8, R9), -32(R5) // Store H
+ STP (R10, R11), -16(R5) // Store I
+ RET
-backwardlargeloop:
- LDP -16(R4), (R8, R10)
- STP (R8, R10), -16(R3)
- LDP.W -32(R4), (R11, R12)
- STP.W (R11, R12), -32(R3)
- CMP R9, R3
- BNE backwardlargeloop
+ // Large backward copy for overlapping copies.
+ // Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
+copy_long_backward:
+ LDP -16(R4), (R12, R13)
+ AND $15, R8, R14
+ SUB R14, R4, R4
+ SUB R14, R2, R2
+ LDP -16(R4), (R6, R7)
+ STP (R12, R13), -16(R5)
+ LDP -32(R4), (R8, R9)
+ LDP -48(R4), (R10, R11)
+ LDP.W -64(R4), (R12, R13)
+ SUB R14, R5, R5
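+ // Mirror of the forward setup: the end pointers were pulled back by R14
+ // so the chosen one is 16-byte aligned; the tail bytes skipped by that
+ // adjustment were already copied by the unconditional 16-byte chunk above.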
+ SUBS $128, R2, R2
+ BLS copy64_from_start
+
+loop64_backward:
+ STP (R6, R7), -16(R5)
+ LDP -16(R4), (R6, R7)
+ STP (R8, R9), -32(R5)
+ LDP -32(R4), (R8, R9)
+ STP (R10, R11), -48(R5)
+ LDP -48(R4), (R10, R11)
+ STP.W (R12, R13), -64(R5)
+ LDP.W -64(R4), (R12, R13)
+ SUBS $64, R2, R2
+ BHI loop64_backward
+
+ // Write the last iteration and copy 64 bytes from the start.
+copy64_from_start:
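+ // Mirror of copy64_from_end: drain the last backward iteration, then
+ // copy the first 64 bytes of the buffer unconditionally.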
+ LDP 48(R1), (R2, R3)
+ STP (R6, R7), -16(R5)
+ LDP 32(R1), (R6, R7)
+ STP (R8, R9), -32(R5)
+ LDP 16(R1), (R8, R9)
+ STP (R10, R11), -48(R5)
+ LDP (R1), (R10, R11)
+ STP (R12, R13), -64(R5)
+ STP (R2, R3), 48(R0)
+ STP (R6, R7), 32(R0)
+ STP (R8, R9), 16(R0)
+ STP (R10, R11), (R0)
RET
var bufSizes = []int{
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
	32, 64, 128, 256, 512, 1024, 2048, 4096,
}
+var bufSizesOverlap = []int{
+ 32, 64, 128, 256, 512, 1024, 2048, 4096,
+}
func BenchmarkMemmove(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
		x := make([]byte, n)
		y := make([]byte, n)
		for i := 0; i < b.N; i++ {
			copy(x, y)
		}
	})
}
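+// BenchmarkMemmoveOverlap copies within a single buffer with dst starting
+// 16 bytes past src; for sizes above 128 this takes the backward-copy path.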
+func BenchmarkMemmoveOverlap(b *testing.B) {
+ benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+ x := make([]byte, n+16)
+ for i := 0; i < b.N; i++ {
+ copy(x[16:n+16], x[:n])
+ }
+ })
+}
+
func BenchmarkMemmoveUnalignedDst(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
		x := make([]byte, n+1)
		y := make([]byte, n)
		for i := 0; i < b.N; i++ {
			copy(x[1:], y)
		}
	})
}
+func BenchmarkMemmoveUnalignedDstOverlap(b *testing.B) {
+ benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+ x := make([]byte, n+16)
+ for i := 0; i < b.N; i++ {
+ copy(x[16:n+16], x[1:n+1])
+ }
+ })
+}
+
func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
		x := make([]byte, n)
		y := make([]byte, n+1)
		for i := 0; i < b.N; i++ {
			copy(x, y[1:])
		}
	})
}
+func BenchmarkMemmoveUnalignedSrcOverlap(b *testing.B) {
+ benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+ x := make([]byte, n+1)
+ for i := 0; i < b.N; i++ {
+ copy(x[1:n+1], x[:n])
+ }
+ })
+}
+
func TestMemclr(t *testing.T) {
size := 512
if testing.Short() {