runtime: improve memmove performance on arm64

author Jonathan Swinney <jswinney@amazon.com>

Fri, 30 Oct 2020 18:46:23 +0000 (18:46 +0000)

committer Cherry Zhang <cherryyz@google.com>

Mon, 2 Nov 2020 15:23:43 +0000 (15:23 +0000)
author Jonathan Swinney <jswinney@amazon.com>
Fri, 30 Oct 2020 18:46:23 +0000 (18:46 +0000)
committer Cherry Zhang <cherryyz@google.com>
Mon, 2 Nov 2020 15:23:43 +0000 (15:23 +0000)
diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go

index 2829945af0bf87710a0637a18d4b3803d854b765..0ceedcd7d223ec69fba3860f45d45ac61d66ef92 100644 (file)
--- a/src/internal/cpu/cpu.go
+++ b/src/internal/cpu/cpu.go
@@ -56,32 +56,34 @@ var ARM struct {
  // The booleans in ARM64 contain the correspondingly named cpu feature bit.
  // The struct is padded to avoid false sharing.
  var ARM64 struct {
-       _           CacheLinePad
-       HasFP       bool
-       HasASIMD    bool
-       HasEVTSTRM  bool
-       HasAES      bool
-       HasPMULL    bool
-       HasSHA1     bool
-       HasSHA2     bool
-       HasCRC32    bool
-       HasATOMICS  bool
-       HasFPHP     bool
-       HasASIMDHP  bool
-       HasCPUID    bool
-       HasASIMDRDM bool
-       HasJSCVT    bool
-       HasFCMA     bool
-       HasLRCPC    bool
-       HasDCPOP    bool
-       HasSHA3     bool
-       HasSM3      bool
-       HasSM4      bool
-       HasASIMDDP  bool
-       HasSHA512   bool
-       HasSVE      bool
-       HasASIMDFHM bool
-       _           CacheLinePad
+       _            CacheLinePad
+       HasFP        bool
+       HasASIMD     bool
+       HasEVTSTRM   bool
+       HasAES       bool
+       HasPMULL     bool
+       HasSHA1      bool
+       HasSHA2      bool
+       HasCRC32     bool
+       HasATOMICS   bool
+       HasFPHP      bool
+       HasASIMDHP   bool
+       HasCPUID     bool
+       HasASIMDRDM  bool
+       HasJSCVT     bool
+       HasFCMA      bool
+       HasLRCPC     bool
+       HasDCPOP     bool
+       HasSHA3      bool
+       HasSM3       bool
+       HasSM4       bool
+       HasASIMDDP   bool
+       HasSHA512    bool
+       HasSVE       bool
+       HasASIMDFHM  bool
+       IsNeoverseN1 bool // not a feature bit: set in doinit when getMIDR reports implementor 'A' and part number 0xd0c
+       IsZeus       bool // not a feature bit: set in doinit when getMIDR reports implementor 'A' and part number 0xd40
+       _            CacheLinePad
  }
  
  var MIPS64X struct {
diff --git a/src/internal/cpu/cpu_arm64.go b/src/internal/cpu/cpu_arm64.go

index 533bea247009faaf3fa86ce513333fecece2967d..8fde39f03e1051ba47c8c9dcbb51a865b78c95e9 100644 (file)
--- a/src/internal/cpu/cpu_arm64.go
+++ b/src/internal/cpu/cpu_arm64.go
@@ -18,6 +18,7 @@ const (
         hwcap_SHA2    = 1 << 6
         hwcap_CRC32   = 1 << 7
         hwcap_ATOMICS = 1 << 8
+       hwcap_CPUID   = 1 << 11
  )
  
  func doinit() {
@@ -28,6 +29,8 @@ func doinit() {
                 {Name: "sha2", Feature: &ARM64.HasSHA2},
                 {Name: "crc32", Feature: &ARM64.HasCRC32},
                 {Name: "atomics", Feature: &ARM64.HasATOMICS},
+               {Name: "isNeoverseN1", Feature: &ARM64.IsNeoverseN1},
+               {Name: "isZeus", Feature: &ARM64.IsZeus},
         }
  
         switch GOOS {
@@ -40,12 +43,32 @@ func doinit() {
                 ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1)
                 ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2)
                 ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32)
+               ARM64.HasCPUID = isSet(HWCap, hwcap_CPUID)
  
                 // The Samsung S9+ kernel reports support for atomics, but not all cores
                 // actually support them, resulting in SIGILL. See issue #28431.
                 // TODO(elias.naur): Only disable the optimization on bad chipsets on android.
                 ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) && GOOS != "android"
  
+               // Identify the CPU model (Neoverse N1 or Zeus) by reading MIDR_EL1,
+               // but only when the auxiliary vector (HWCap) reports the CPUID bit.
+               // The getMIDR function executes an instruction which would normally be
+               // an illegal instruction, but it's trapped by the kernel, the value
+               // sanitized and then returned. Without the CPUID bit the kernel will not
+               // trap the instruction and the process will be terminated with SIGILL.
+               if ARM64.HasCPUID {
+                       midr := getMIDR()
+                       part_num := uint16((midr >> 4) & 0xfff)
+                       implementor := byte((midr >> 24) & 0xff)
+
+                       if implementor == 'A' && part_num == 0xd0c {
+                               ARM64.IsNeoverseN1 = true
+                       }
+                       if implementor == 'A' && part_num == 0xd40 {
+                               ARM64.IsZeus = true
+                       }
+               }
+
         case "freebsd":
                 // Retrieve info from system register ID_AA64ISAR0_EL1.
                 isar0 := getisar0()
@@ -93,3 +116,5 @@ func isSet(hwc uint, value uint) bool {
  }
  
  func getisar0() uint64
+
+func getMIDR() uint64
diff --git a/src/internal/cpu/cpu_arm64.s b/src/internal/cpu/cpu_arm64.s

index d85914973f94ae5a85b3fcdb1c9a52a598091cde..d6e7f4437391018e08b8e6210cdf4714259ff3b5 100644 (file)
--- a/src/internal/cpu/cpu_arm64.s
+++ b/src/internal/cpu/cpu_arm64.s
@@ -10,3 +10,9 @@ TEXT ·getisar0(SB),NOSPLIT,$0
         MRS     ID_AA64ISAR0_EL1, R0
         MOVD    R0, ret+0(FP)
         RET
+
+// func getMIDR() uint64
+TEXT ·getMIDR(SB), NOSPLIT, $0-8
+       MRS     MIDR_EL1, R0    // read system register MIDR_EL1 into R0
+       MOVD    R0, ret+0(FP)   // store R0 as the uint64 result
+       RET
diff --git a/src/runtime/cpuflags_arm64.go b/src/runtime/cpuflags_arm64.go

new file mode 100644 (file)

index 0000000..7576bef
--- /dev/null
+++ b/src/runtime/cpuflags_arm64.go
@@ -0,0 +1,17 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+       "internal/cpu"
+)
+
+var arm64UseAlignedLoads bool // read by memmove_arm64.s for copies of 1024+ bytes: true aligns on the source, false on the destination
+
+func init() { // selects the memmove alignment strategy from the detected CPU model
+       if cpu.ARM64.IsNeoverseN1 || cpu.ARM64.IsZeus { // only these two cores opt in to aligned loads
+               arm64UseAlignedLoads = true // otherwise the flag keeps its zero value (false)
+       }
+}
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s

index dbb7e9a28a03f0ebc099cf9f96747b5759450f49..43d27629e5bd13435107324502f43d0d70a7d792 100644 (file)
--- a/src/runtime/memmove_arm64.s
+++ b/src/runtime/memmove_arm64.s
@@ -6,152 +6,236 @@
  
  // See memmove Go doc for important implementation constraints.
  
+// Register map
+//
+// dstin  R0
+// src    R1
+// count  R2
+// dst    R3 (same as R0, but gets modified in unaligned cases)
+// srcend R4
+// dstend R5
+// data   R6-R17
+// tmp1   R14
+
+// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+// copies of up to 128 bytes, and large copies. The overhead of the overlap
+// check is negligible since it is only required for large copies.
+//
+// Large copies use a software pipelined loop processing 64 bytes per iteration.
+// From 1024 bytes up, src or dst (per arm64UseAlignedLoads) is 16-byte aligned to minimize unaligned accesses.
+// The loop tail is handled by always copying 64 bytes from the end.
+
  // func memmove(to, from unsafe.Pointer, n uintptr)
  TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
-       MOVD    to+0(FP), R3
-       MOVD    from+8(FP), R4
-       MOVD    n+16(FP), R5
-       CBNZ    R5, check
-       RET
+       MOVD    to+0(FP), R0
+       MOVD    from+8(FP), R1
+       MOVD    n+16(FP), R2
+       CBZ     R2, copy0
  
-check:
-       CMP     $16, R5
+       // Small copies: 1..16 bytes
+       CMP     $16, R2
         BLE     copy16
  
-       AND     $~31, R5, R7    // R7 is N&~31
-       SUB     R7, R5, R6      // R6 is N&31
-
-       CMP     R3, R4
-       BLT     backward
-
-       // Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
-       // R3 and R4 are advanced as we copy.
-
-       // (There may be implementations of armv8 where copying by bytes until
-       // at least one of source or dest is word aligned is a worthwhile
-       // optimization, but the on the one tested so far (xgene) it did not
-       // make a significance difference.)
-
-       CBZ     R7, noforwardlarge      // Do we need to do any quadword copying?
-
-       ADD     R3, R7, R9      // R9 points just past where we copy by word
-
-forwardlargeloop:
-       // Copy 32 bytes at a time.
-       LDP.P   32(R4), (R8, R10)
-       STP.P   (R8, R10), 32(R3)
-       LDP     -16(R4), (R11, R12)
-       STP     (R11, R12), -16(R3)
-       SUB     $32, R7, R7
-       CBNZ    R7, forwardlargeloop
-
-noforwardlarge:
-       CBNZ    R6, forwardtail         // Do we need to copy any tail bytes?
+       // Large (more than 128 bytes) and medium (33..128 bytes) copies
+       CMP     $128, R2
+       BHI     copy_long
+       CMP     $32, R2
+       BHI     copy32_128
+
+       // Small copies: 17..32 bytes.
+       LDP     (R1), (R6, R7)      // first 16 source bytes
+       ADD     R1, R2, R4          // R4 points just past the last source byte
+       LDP     -16(R4), (R12, R13) // last 16 source bytes (may overlap the first 16)
+       STP     (R6, R7), (R0)      // both loads are done before any store, so overlapping src/dst is fine
+       ADD     R0, R2, R5          // R5 points just past the last destination byte
+       STP     (R12, R13), -16(R5)
         RET
  
-forwardtail:
-       // There are R6 <= 31 bytes remaining to copy.
-       // This is large enough to still contain pointers,
-       // which must be copied atomically.
-       // Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
-       TBZ     $4, R6, 3(PC)   // write 16 bytes if R6&16 != 0
-       LDP.P   16(R4), (R8, R10)
-       STP.P   (R8, R10), 16(R3)
-
-       TBZ     $3, R6, 3(PC)   // write 8 bytes if R6&8 != 0
-       MOVD.P  8(R4), R8
-       MOVD.P  R8, 8(R3)
-
-       AND     $7, R6
-       CBNZ    R6, 2(PC)
-       RET
-
-       ADD     R3, R6, R9      // R9 points just past the destination memory
-
-forwardtailloop:
-       MOVBU.P 1(R4), R8
-       MOVBU.P R8, 1(R3)
-       CMP     R3, R9
-       BNE     forwardtailloop
-       RET
-
-       // Small copies: 1..16 bytes.
+// Small copies: 1..16 bytes.
  copy16:
-       ADD     R4, R5, R8      // R8 points just past the last source byte
-       ADD     R3, R5, R9      // R9 points just past the last destination byte
-       CMP     $8, R5
+       ADD     R1, R2, R4 // R4 points just past the last source byte
+       ADD     R0, R2, R5 // R5 points just past the last destination byte
+       CMP     $8, R2
         BLT     copy7
-       MOVD    (R4), R6
-       MOVD    -8(R8), R7
-       MOVD    R6, (R3)
-       MOVD    R7, -8(R9)
+       MOVD    (R1), R6
+       MOVD    -8(R4), R7
+       MOVD    R6, (R0)
+       MOVD    R7, -8(R5)
         RET
  
  copy7:
-       TBZ     $2, R5, copy3
-       MOVWU   (R4), R6
-       MOVWU   -4(R8), R7
-       MOVW    R6, (R3)
-       MOVW    R7, -4(R9)
+       TBZ     $2, R2, copy3
+       MOVWU   (R1), R6
+       MOVWU   -4(R4), R7
+       MOVW    R6, (R0)
+       MOVW    R7, -4(R5)
         RET
  
  copy3:
-       TBZ     $1, R5, copy1
-       MOVHU   (R4), R6
-       MOVHU   -2(R8), R7
-       MOVH    R6, (R3)
-       MOVH    R7, -2(R9)
+       TBZ     $1, R2, copy1
+       MOVHU   (R1), R6
+       MOVHU   -2(R4), R7
+       MOVH    R6, (R0)
+       MOVH    R7, -2(R5)
         RET
  
  copy1:
-       MOVBU   (R4), R6
-       MOVB    R6, (R3)
-       RET
-
-backward:
-       // Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
-       // R3 and R4 are advanced to the end of the destination/source buffers
-       // respectively and moved back as we copy.
-
-       ADD     R4, R5, R4      // R4 points just past the last source byte
-       ADD     R3, R5, R3      // R3 points just past the last destination byte
-
-       CBZ     R6, nobackwardtail      // Do we need to do any byte-by-byte copying?
+       MOVBU   (R1), R6
+       MOVB    R6, (R0)
  
-       AND     $7, R6, R12
-       CBZ     R12, backwardtaillarge
-
-       SUB     R12, R3, R9     // R9 points at the lowest destination byte that should be copied by byte.
-backwardtailloop:
-       // Copy sub-pointer-size tail.
-       MOVBU.W -1(R4), R8
-       MOVBU.W R8, -1(R3)
-       CMP     R9, R3
-       BNE     backwardtailloop
-
-backwardtaillarge:
-       // Do 8/16-byte write if possible.
-       // See comment at forwardtail.
-       TBZ     $3, R6, 3(PC)
-       MOVD.W  -8(R4), R8
-       MOVD.W  R8, -8(R3)
+copy0:
+       RET
  
-       TBZ     $4, R6, 3(PC)
-       LDP.W   -16(R4), (R8, R10)
-       STP.W   (R8, R10), -16(R3)
+       // Medium copies: 33..128 bytes.
+copy32_128:
+       ADD     R1, R2, R4          // R4 points just past the last source byte
+       ADD     R0, R2, R5          // R5 points just past the last destination byte
+       LDP     (R1), (R6, R7)      // first 32 source bytes -> R6-R9
+       LDP     16(R1), (R8, R9)
+       LDP     -32(R4), (R10, R11) // last 32 source bytes -> R10-R13
+       LDP     -16(R4), (R12, R13)
+       CMP     $64, R2
+       BHI     copy128             // more than 64 bytes: head + tail alone do not cover the range
+       STP     (R6, R7), (R0)      // 33..64 bytes: head and tail stores (possibly overlapping) cover everything
+       STP     (R8, R9), 16(R0)
+       STP     (R10, R11), -32(R5)
+       STP     (R12, R13), -16(R5)
+       RET
  
-nobackwardtail:
-       CBNZ     R7, backwardlarge      // Do we need to do any doubleword-by-doubleword copying?
+       // Copy 65..128 bytes.
+copy128:
+       LDP     32(R1), (R14, R15)  // source bytes 32..63 -> R14-R17
+       LDP     48(R1), (R16, R17)
+       CMP     $96, R2
+       BLS     copy96              // up to 96 bytes: first 64 + last 32 already cover the range
+       LDP     -64(R4), (R2, R3)   // 97..128 bytes: count is no longer needed, so R2/R3 carry data
+       LDP     -48(R4), (R1, R4)   // last use of src/srcend as addresses; R1/R4 become data registers
+       STP     (R2, R3), -64(R5)   // first store: every load of this copy has been issued by now
+       STP     (R1, R4), -48(R5)
+
+copy96:
+       STP     (R6, R7), (R0)
+       STP     (R8, R9), 16(R0)
+       STP     (R14, R15), 32(R0)
+       STP     (R16, R17), 48(R0)
+       STP     (R10, R11), -32(R5)
+       STP     (R12, R13), -16(R5)
         RET
  
-backwardlarge:
-       SUB     R7, R3, R9      // R9 points at the lowest destination byte
+       // Copy more than 128 bytes.
+copy_long:
+       ADD     R1, R2, R4 // R4 points just past the last source byte
+       ADD     R0, R2, R5 // R5 points just past the last destination byte
+       MOVD    ZR, R7     // default 0, so the later AND $15 gives a zero realignment offset (forward path)
+       MOVD    ZR, R8     // same default for the backward path
+
+       CMP     $1024, R2
+       BLT     backward_check // under 1024 bytes: keep R7 = R8 = 0 and skip feature detection
+       // feature detect to decide how to align
+       MOVBU   runtime·arm64UseAlignedLoads(SB), R6
+       CBNZ    R6, use_aligned_loads
+       MOVD    R0, R7 // flag clear: realign relative to the destination start ...
+       MOVD    R5, R8 // ... and the destination end
+       B       backward_check
+use_aligned_loads:
+       MOVD    R1, R7 // flag set: realign relative to the source start ...
+       MOVD    R4, R8 // ... and the source end
+       // R7 and R8 are used here for the realignment calculation. In
+       // the use_aligned_loads case, R7 is the src pointer and R8 is
+       // srcend pointer, which is used in the backward copy case.
+       // When doing aligned stores, R7 is the dst pointer and R8 is
+       // the dstend pointer.
+
+backward_check:
+       // Use backward copy if there is an overlap.
+       SUB     R1, R0, R14          // R14 = dst - src
+       CBZ     R14, copy0           // dst == src: nothing to move
+       CMP     R2, R14
+       BCC     copy_long_backward   // unsigned (dst - src) < count: dst starts inside the source range
+
+       // Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
+       LDP     (R1), (R12, R13)     // Load  A
+       AND     $15, R7, R14         // Calculate the realignment offset
+       SUB     R14, R1, R1          // move src back by the offset
+       SUB     R14, R0, R3          // move dst back same amount as src
+       ADD     R14, R2, R2          // count grows by the offset to match the moved-back pointers
+       LDP     16(R1), (R6, R7)     // Load   B
+       STP     (R12, R13), (R0)     // Store A
+       LDP     32(R1), (R8, R9)     // Load    C
+       LDP     48(R1), (R10, R11)   // Load     D
+       LDP.W   64(R1), (R12, R13)   // Load      E
+       // 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
+       SUBS    $144, R2, R2
+       BLS     copy64_from_end
+
+loop64:
+       STP     (R6, R7), 16(R3)     // Store  B
+       LDP     16(R1), (R6, R7)     // Load   B (next iteration)
+       STP     (R8, R9), 32(R3)     // Store   C
+       LDP     32(R1), (R8, R9)     // Load    C
+       STP     (R10, R11), 48(R3)   // Store    D
+       LDP     48(R1), (R10, R11)   // Load     D
+       STP.W   (R12, R13), 64(R3)   // Store     E
+       LDP.W   64(R1), (R12, R13)   // Load      E
+       SUBS    $64, R2, R2          // one 64-byte iteration consumed
+       BHI     loop64               // repeat while the counter was above 64 before the subtraction
+
+       // Write the last iteration and copy 64 bytes from the end.
+copy64_from_end:
+       LDP     -64(R4), (R14, R15)  // Load       F
+       STP     (R6, R7), 16(R3)     // Store  B
+       LDP     -48(R4), (R6, R7)    // Load        G
+       STP     (R8, R9), 32(R3)     // Store   C
+       LDP     -32(R4), (R8, R9)    // Load         H
+       STP     (R10, R11), 48(R3)   // Store    D
+       LDP     -16(R4), (R10, R11)  // Load          I
+       STP     (R12, R13), 64(R3)   // Store     E
+       STP     (R14, R15), -64(R5)  // Store      F
+       STP     (R6, R7), -48(R5)    // Store       G
+       STP     (R8, R9), -32(R5)    // Store        H
+       STP     (R10, R11), -16(R5)  // Store         I
+       RET
  
-backwardlargeloop:
-       LDP     -16(R4), (R8, R10)
-       STP     (R8, R10), -16(R3)
-       LDP.W   -32(R4), (R11, R12)
-       STP.W   (R11, R12), -32(R3)
-       CMP     R9, R3
-       BNE     backwardlargeloop
+       // Large backward copy for overlapping copies.
+       // Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
+copy_long_backward:
+       LDP     -16(R4), (R12, R13)  // last 16 source bytes, loaded before any realignment
+       AND     $15, R8, R14         // realignment offset taken from R8 (0 when R8 was left as ZR)
+       SUB     R14, R4, R4          // move srcend down by the offset
+       SUB     R14, R2, R2          // count shrinks by the same amount
+       LDP     -16(R4), (R6, R7)
+       STP     (R12, R13), -16(R5)  // stored relative to the still-unadjusted dstend
+       LDP     -32(R4), (R8, R9)
+       LDP     -48(R4), (R10, R11)
+       LDP.W   -64(R4), (R12, R13)
+       SUB     R14, R5, R5          // now move dstend down by the same offset as srcend
+       SUBS    $128, R2, R2
+       BLS     copy64_from_start
+
+loop64_backward:
+       STP     (R6, R7), -16(R5)
+       LDP     -16(R4), (R6, R7)
+       STP     (R8, R9), -32(R5)
+       LDP     -32(R4), (R8, R9)
+       STP     (R10, R11), -48(R5)
+       LDP     -48(R4), (R10, R11)
+       STP.W   (R12, R13), -64(R5)
+       LDP.W   -64(R4), (R12, R13)
+       SUBS    $64, R2, R2          // one 64-byte iteration consumed
+       BHI     loop64_backward      // repeat while the counter was above 64 before the subtraction
+
+       // Write the last iteration and copy 64 bytes from the start.
+copy64_from_start:
+       LDP     48(R1), (R2, R3)     // R1/R0 are unmodified on this path; R2/R3 are reused as data registers
+       STP     (R6, R7), -16(R5)
+       LDP     32(R1), (R6, R7)
+       STP     (R8, R9), -32(R5)
+       LDP     16(R1), (R8, R9)
+       STP     (R10, R11), -48(R5)
+       LDP     (R1), (R10, R11)
+       STP     (R12, R13), -64(R5)
+       STP     (R2, R3), 48(R0)     // first 64 destination bytes, written after all loads have been issued
+       STP     (R6, R7), 32(R0)
+       STP     (R8, R9), 16(R0)
+       STP     (R10, R11), (R0)
         RET
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go

index b549433f71ce66e47be0d17cbac323a94e8de77b..7c9d2ada45fd78d3d88be35fe4616a085e2b3715 100644 (file)
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -286,6 +286,9 @@ var bufSizes = []int{
         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
         32, 64, 128, 256, 512, 1024, 2048, 4096,
  }
+var bufSizesOverlap = []int{ // copy lengths used by the *Overlap memmove benchmarks
+       32, 64, 128, 256, 512, 1024, 2048, 4096,
+}
  
  func BenchmarkMemmove(b *testing.B) {
         benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
@@ -297,6 +300,15 @@ func BenchmarkMemmove(b *testing.B) {
         })
  }
  
+func BenchmarkMemmoveOverlap(b *testing.B) { // times copy between overlapping halves of one buffer
+       benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+               x := make([]byte, n+16)
+               for i := 0; i < b.N; i++ {
+                       copy(x[16:n+16], x[:n]) // dst starts 16 bytes above src in the same backing array
+               }
+       })
+}
+
  func BenchmarkMemmoveUnalignedDst(b *testing.B) {
         benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
                 x := make([]byte, n+1)
@@ -307,6 +319,15 @@ func BenchmarkMemmoveUnalignedDst(b *testing.B) {
         })
  }
  
+func BenchmarkMemmoveUnalignedDstOverlap(b *testing.B) { // overlapping copy with src and dst at different offsets in one buffer
+       benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+               x := make([]byte, n+16)
+               for i := 0; i < b.N; i++ {
+                       copy(x[16:n+16], x[1:n+1]) // dst at offset 16, src at offset 1. NOTE(review): the odd offset is on src, not dst — confirm the name matches the intent
+               }
+       })
+}
+
  func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
         benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
                 x := make([]byte, n)
@@ -317,6 +338,15 @@ func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
         })
  }
  
+func BenchmarkMemmoveUnalignedSrcOverlap(b *testing.B) { // overlapping copy shifted by a single byte within one buffer
+       benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+               x := make([]byte, n+1)
+               for i := 0; i < b.N; i++ {
+                       copy(x[1:n+1], x[:n]) // dst at offset 1, src at offset 0. NOTE(review): the odd offset is on dst, not src — confirm the name matches the intent
+               }
+       })
+}
+
  func TestMemclr(t *testing.T) {
         size := 512
         if testing.Short() {
author	Jonathan Swinney <jswinney@amazon.com>
	Fri, 30 Oct 2020 18:46:23 +0000 (18:46 +0000)
committer	Cherry Zhang <cherryyz@google.com>
	Mon, 2 Nov 2020 15:23:43 +0000 (15:23 +0000)
src/internal/cpu/cpu.go		patch \| blob \| history
src/internal/cpu/cpu_arm64.go		patch \| blob \| history
src/internal/cpu/cpu_arm64.s		patch \| blob \| history
src/runtime/cpuflags_arm64.go	[new file with mode: 0644]	patch \| blob
src/runtime/memmove_arm64.s		patch \| blob \| history
src/runtime/memmove_test.go		patch \| blob \| history