[release-branch.go1.12] runtime: ensure memmove write pointer atomically on ARM64

author Cherry Zhang <cherryyz@google.com>

Fri, 27 Dec 2019 17:02:00 +0000 (12:02 -0500)

committer Alexander Rakoczy <alex@golang.org>

Wed, 8 Jan 2020 22:22:22 +0000 (22:22 +0000)
author Cherry Zhang <cherryyz@google.com>
Fri, 27 Dec 2019 17:02:00 +0000 (12:02 -0500)
committer Alexander Rakoczy <alex@golang.org>
Wed, 8 Jan 2020 22:22:22 +0000 (22:22 +0000)
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s

index dcbead8cf4bfb48e4961b3916b6ec37068a98fbd..4b6b4965afc4b4778674a66f74ee1e6b9dae4e26 100644 (file)
--- a/src/runtime/memmove_arm64.s
+++ b/src/runtime/memmove_arm64.s
@@ -22,7 +22,7 @@ check:
         CMP     R3, R4
         BLT     backward
  
-       // Copying forward proceeds by copying R7/8 words then copying R6 bytes.
+       // Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
         // R3 and R4 are advanced as we copy.
  
          // (There may be implementations of armv8 where copying by bytes until
@@ -30,11 +30,12 @@ check:
          // optimization, but on the one tested so far (xgene) it did not
          // make a significant difference.)
  
-       CBZ     R7, noforwardlarge      // Do we need to do any doubleword-by-doubleword copying?
+       CBZ     R7, noforwardlarge      // Do we need to do any quadword copying?
  
         ADD     R3, R7, R9      // R9 points just past where we copy by word
  
  forwardlargeloop:
+       // Copy 32 bytes at a time.
         LDP.P   32(R4), (R8, R10)
         STP.P   (R8, R10), 32(R3)
         LDP     -16(R4), (R11, R12)
@@ -43,10 +44,26 @@ forwardlargeloop:
         CBNZ    R7, forwardlargeloop
  
  noforwardlarge:
-       CBNZ    R6, forwardtail         // Do we need to do any byte-by-byte copying?
+       CBNZ    R6, forwardtail         // Do we need to copy any tail bytes?
         RET
  
  forwardtail:
+       // There are R6 <= 31 bytes remaining to copy.
+       // This is large enough to still contain pointers,
+       // which must be copied atomically.
+       // Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
+       TBZ     $4, R6, 3(PC)   // write 16 bytes if R6&16 != 0
+       LDP.P   16(R4), (R8, R10)
+       STP.P   (R8, R10), 16(R3)
+
+       TBZ     $3, R6, 3(PC)   // write 8 bytes if R6&8 != 0
+       MOVD.P  8(R4), R8
+       MOVD.P  R8, 8(R3)
+
+       AND     $7, R6
+       CBNZ    R6, 2(PC)
+       RET
+
         ADD     R3, R6, R9      // R9 points just past the destination memory
  
  forwardtailloop:
@@ -90,7 +107,7 @@ copy1:
         RET
  
  backward:
-       // Copying backwards proceeds by copying R6 bytes then copying R7/8 words.
+       // Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
         // R3 and R4 are advanced to the end of the destination/source buffers
         // respectively and moved back as we copy.
  
@@ -99,13 +116,28 @@ backward:
  
         CBZ     R6, nobackwardtail      // Do we need to copy any tail bytes?
  
-       SUB     R6, R3, R9      // R9 points at the lowest destination byte that should be copied by byte.
+       AND     $7, R6, R12
+       CBZ     R12, backwardtaillarge
+
+       SUB     R12, R3, R9     // R9 points at the lowest destination byte that should be copied by byte.
  backwardtailloop:
+       // Copy sub-pointer-size tail.
         MOVBU.W -1(R4), R8
         MOVBU.W R8, -1(R3)
         CMP     R9, R3
         BNE     backwardtailloop
  
+backwardtaillarge:
+       // Do 8/16-byte write if possible.
+       // See comment at forwardtail.
+       TBZ     $3, R6, 3(PC)
+       MOVD.W  -8(R4), R8
+       MOVD.W  R8, -8(R3)
+
+       TBZ     $4, R6, 3(PC)
+       LDP.W   -16(R4), (R8, R10)
+       STP.W   (R8, R10), -16(R3)
+
  nobackwardtail:
         CBNZ     R7, backwardlarge      // Do we need to do any quadword copying?
         RET
author	Cherry Zhang <cherryyz@google.com>
	Fri, 27 Dec 2019 17:02:00 +0000 (12:02 -0500)
committer	Alexander Rakoczy <alex@golang.org>
	Wed, 8 Jan 2020 22:22:22 +0000 (22:22 +0000)