runtime: improve arm64 memclr implementation
author    Balaram Makam <bmakam.qdt@qualcommdatacenter.com>
          Tue, 5 Dec 2017 22:51:10 +0000 (17:51 -0500)
committer Cherry Zhang <cherryyz@google.com>
          Wed, 14 Mar 2018 18:20:40 +0000 (18:20 +0000)
Improve runtime memclr_arm64.s by using the ZVA feature to zero out memory
when n is at least 64 bytes.

Also add the DCZID_EL0 system register for use with the MRS instruction.
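
For reference, the DC ZVA block size is derived from DCZID_EL0 exactly as the
new assembly does it: bit 4 (DZP) prohibits the instruction when set, and bits
[3:0] hold log2 of the block size in 4-byte words. A minimal Go sketch of that
decoding follows (the helper name and the example value are illustrative, not
part of the change):

package main

import "fmt"

// dczvaBlockSize decodes a raw DCZID_EL0 value: it reports whether DC ZVA is
// permitted and, if so, the block size in bytes that each DC ZVA clears.
func dczvaBlockSize(dczid uint64) (bytes int, permitted bool) {
	if dczid&(1<<4) != 0 { // DZP bit set: DC ZVA prohibited
		return 0, false
	}
	return 4 << (dczid & 0xF), true // words are 4 bytes wide
}

func main() {
	fmt.Println(dczvaBlockSize(0x4)) // 64 true: 16 words = a 64-byte block
}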

Benchmark results of runtime/Memclr on Amberwing:
name          old time/op    new time/op    delta
Memclr/5        12.7ns ± 0%    12.7ns ± 0%      ~     (all equal)
Memclr/16       12.7ns ± 0%    12.2ns ± 1%    -4.13%  (p=0.000 n=7+8)
Memclr/64       14.0ns ± 0%    14.6ns ± 1%    +4.29%  (p=0.000 n=7+8)
Memclr/256      23.7ns ± 0%    25.7ns ± 0%    +8.44%  (p=0.000 n=8+7)
Memclr/4096      204ns ± 0%      74ns ± 0%   -63.71%  (p=0.000 n=8+8)
Memclr/65536    2.89µs ± 0%    0.84µs ± 0%   -70.91%  (p=0.000 n=8+8)
Memclr/1M       45.9µs ± 0%    17.0µs ± 0%   -62.88%  (p=0.000 n=8+8)
Memclr/4M        184µs ± 0%      77µs ± 4%   -57.94%  (p=0.001 n=6+8)
Memclr/8M        367µs ± 0%     144µs ± 1%   -60.72%  (p=0.000 n=7+8)
Memclr/16M       734µs ± 0%     293µs ± 1%   -60.09%  (p=0.000 n=8+8)
Memclr/64M      2.94ms ± 0%    1.23ms ± 0%   -58.06%  (p=0.000 n=7+8)
GoMemclr/5      8.00ns ± 0%    8.79ns ± 0%    +9.83%  (p=0.000 n=8+8)
GoMemclr/16     8.00ns ± 0%    7.60ns ± 0%    -5.00%  (p=0.000 n=8+8)
GoMemclr/64     10.8ns ± 0%    10.4ns ± 0%    -3.70%  (p=0.000 n=8+8)
GoMemclr/256    20.4ns ± 0%    21.2ns ± 0%    +3.92%  (p=0.000 n=8+8)

name          old speed      new speed      delta
Memclr/5       394MB/s ± 0%   393MB/s ± 0%    -0.28%  (p=0.006 n=8+8)
Memclr/16     1.26GB/s ± 0%  1.31GB/s ± 1%    +4.07%  (p=0.000 n=7+8)
Memclr/64     4.57GB/s ± 0%  4.39GB/s ± 2%    -3.91%  (p=0.000 n=7+8)
Memclr/256    10.8GB/s ± 0%  10.0GB/s ± 0%    -7.95%  (p=0.001 n=7+6)
Memclr/4096   20.1GB/s ± 0%  55.3GB/s ± 0%  +175.46%  (p=0.000 n=8+8)
Memclr/65536  22.6GB/s ± 0%  77.8GB/s ± 0%  +243.63%  (p=0.000 n=7+8)
Memclr/1M     22.8GB/s ± 0%  61.5GB/s ± 0%  +169.38%  (p=0.000 n=8+8)
Memclr/4M     22.8GB/s ± 0%  54.3GB/s ± 4%  +137.85%  (p=0.001 n=6+8)
Memclr/8M     22.8GB/s ± 0%  58.1GB/s ± 1%  +154.56%  (p=0.000 n=7+8)
Memclr/16M    22.8GB/s ± 0%  57.2GB/s ± 1%  +150.54%  (p=0.000 n=8+8)
Memclr/64M    22.8GB/s ± 0%  54.4GB/s ± 0%  +138.42%  (p=0.000 n=7+8)
GoMemclr/5     625MB/s ± 0%   569MB/s ± 0%    -8.90%  (p=0.000 n=7+8)
GoMemclr/16   2.00GB/s ± 0%  2.10GB/s ± 0%    +5.26%  (p=0.000 n=8+8)
GoMemclr/64   5.92GB/s ± 0%  6.15GB/s ± 0%    +3.83%  (p=0.000 n=7+8)
GoMemclr/256  12.5GB/s ± 0%  12.1GB/s ± 0%    -3.77%  (p=0.000 n=8+7)

Benchmark results of runtime/Memclr on Amberwing without ZVA:
name          old time/op    new time/op    delta
Memclr/5        12.7ns ± 0%    12.8ns ± 0%   +0.79%  (p=0.008 n=5+5)
Memclr/16       12.7ns ± 0%    12.7ns ± 0%     ~     (p=0.444 n=5+5)
Memclr/64       14.0ns ± 0%    14.4ns ± 0%   +2.86%  (p=0.008 n=5+5)
Memclr/256      23.7ns ± 1%    19.2ns ± 0%  -19.06%  (p=0.008 n=5+5)
Memclr/4096      203ns ± 0%     119ns ± 0%  -41.38%  (p=0.008 n=5+5)
Memclr/65536    2.89µs ± 0%    1.66µs ± 0%  -42.76%  (p=0.008 n=5+5)
Memclr/1M       45.9µs ± 0%    26.2µs ± 0%  -42.82%  (p=0.008 n=5+5)
Memclr/4M        184µs ± 0%     105µs ± 0%  -42.81%  (p=0.008 n=5+5)
Memclr/8M        367µs ± 0%     210µs ± 0%  -42.76%  (p=0.008 n=5+5)
Memclr/16M       734µs ± 0%     420µs ± 0%  -42.74%  (p=0.008 n=5+5)
Memclr/64M      2.94ms ± 0%    1.69ms ± 0%  -42.46%  (p=0.008 n=5+5)
GoMemclr/5      8.00ns ± 0%    8.40ns ± 0%   +5.00%  (p=0.008 n=5+5)
GoMemclr/16     8.00ns ± 0%    8.40ns ± 0%   +5.00%  (p=0.008 n=5+5)
GoMemclr/64     10.8ns ± 0%     9.6ns ± 0%  -11.02%  (p=0.008 n=5+5)
GoMemclr/256    20.4ns ± 0%    17.2ns ± 0%  -15.69%  (p=0.008 n=5+5)

name          old speed      new speed      delta
Memclr/5       393MB/s ± 0%   391MB/s ± 0%   -0.64%  (p=0.008 n=5+5)
Memclr/16     1.26GB/s ± 0%  1.26GB/s ± 0%   -0.55%  (p=0.008 n=5+5)
Memclr/64     4.57GB/s ± 0%  4.44GB/s ± 0%   -2.79%  (p=0.008 n=5+5)
Memclr/256    10.8GB/s ± 0%  13.3GB/s ± 0%  +23.07%  (p=0.016 n=4+5)
Memclr/4096   20.1GB/s ± 0%  34.3GB/s ± 0%  +70.91%  (p=0.008 n=5+5)
Memclr/65536  22.7GB/s ± 0%  39.6GB/s ± 0%  +74.65%  (p=0.008 n=5+5)
Memclr/1M     22.8GB/s ± 0%  40.0GB/s ± 0%  +74.88%  (p=0.008 n=5+5)
Memclr/4M     22.8GB/s ± 0%  39.9GB/s ± 0%  +74.84%  (p=0.008 n=5+5)
Memclr/8M     22.9GB/s ± 0%  39.9GB/s ± 0%  +74.71%  (p=0.008 n=5+5)
Memclr/16M    22.9GB/s ± 0%  39.9GB/s ± 0%  +74.64%  (p=0.008 n=5+5)
Memclr/64M    22.8GB/s ± 0%  39.7GB/s ± 0%  +73.79%  (p=0.008 n=5+5)
GoMemclr/5     625MB/s ± 0%   595MB/s ± 0%   -4.77%  (p=0.000 n=4+5)
GoMemclr/16   2.00GB/s ± 0%  1.90GB/s ± 0%   -4.77%  (p=0.008 n=5+5)
GoMemclr/64   5.92GB/s ± 0%  6.66GB/s ± 0%  +12.48%  (p=0.016 n=4+5)
GoMemclr/256  12.5GB/s ± 0%  14.9GB/s ± 0%  +18.95%  (p=0.008 n=5+5)

Fixes #22948

Change-Id: Iaae4e22391e25b54d299821bb7f8a81ac3986b93
Reviewed-on: https://go-review.googlesource.com/82055
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
src/cmd/asm/internal/arch/arch.go
src/cmd/asm/internal/asm/testdata/arm64enc.s
src/cmd/internal/obj/arm64/a.out.go
src/cmd/internal/obj/arm64/asm7.go
src/cmd/internal/obj/arm64/list7.go
src/runtime/memclr_arm64.s

src/cmd/asm/internal/arch/arch.go
index cd028f6bee60636a59fbae49b9141db6f88e93e6..5ee415028a2e9653316e4938422683278f770500 100644
@@ -260,6 +260,7 @@ func archArm64() *Arch {
        register["SPSel"] = arm64.REG_SPSel
        register["DAIFSet"] = arm64.REG_DAIFSet
        register["DAIFClr"] = arm64.REG_DAIFClr
+       register["DCZID_EL0"] = arm64.REG_DCZID_EL0
        register["PLDL1KEEP"] = arm64.REG_PLDL1KEEP
        register["PLDL1STRM"] = arm64.REG_PLDL1STRM
        register["PLDL2KEEP"] = arm64.REG_PLDL2KEEP
src/cmd/asm/internal/asm/testdata/arm64enc.s
index 79baded1da3c6d5b3dfbab22f3ed40db9bc332f1..11d82d8166b1d4e37751bdcc04ca94c2bc459ef3 100644
@@ -251,6 +251,7 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$-8
    MSR $6, DAIFClr                            // ff4603d5
    MRS ELR_EL1, R8                            // 284038d5
    MSR R16, ELR_EL1                           // 304018d5
+   MRS DCZID_EL0, R3                          // e3003bd5
    MSUBW R1, R1, R12, R5                      // 8585011b
    MSUB R19, R16, R26, R2                     // 42c3139b
    MULW R26, R5, R22                          // b67c1a1b
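
The expected bytes in the new test line above follow from the architectural
MRS encoding. DCZID_EL0 is S3_3_C0_C0_7 (op0=3, op1=3, CRn=0, CRm=0, op2=7),
which for destination register R3 yields 0xd53b00e3, stored little-endian as
e3003bd5. A small illustrative check in Go (not part of the change):

package main

import "fmt"

func main() {
	const (
		mrsBase            = 0xd5300000 // fixed MRS bits (op0's upper bit is always 1 here)
		o0                 = 1          // low bit of op0 (op0 = 0b11)
		op1, crn, crm, op2 = 3, 0, 0, 7 // system-register selectors for DCZID_EL0
		rt                 = 3          // destination register R3
	)
	enc := uint32(mrsBase | o0<<19 | op1<<16 | crn<<12 | crm<<8 | op2<<5 | rt)
	fmt.Printf("%08x\n", enc) // d53b00e3
}
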
src/cmd/internal/obj/arm64/a.out.go
index b70426af2dec276b19af36d588c828ca6e49b787..1a2313f61ea57df7356efbd58b67a7405b456497 100644
@@ -208,6 +208,7 @@ const (
        REG_SPSel
        REG_DAIFSet
        REG_DAIFClr
+       REG_DCZID_EL0
        REG_PLDL1KEEP
        REG_PLDL1STRM
        REG_PLDL2KEEP
src/cmd/internal/obj/arm64/asm7.go
index 7ba56429d35ac4bddd7e68f118e61ad677d077bf..3b7ad244933222608ebb91419e01fc2a98310c68 100644
@@ -634,6 +634,7 @@ var systemreg = []struct {
        enc uint32
 }{
        {REG_ELR_EL1, 8<<16 | 4<<12 | 1<<5},
+       {REG_DCZID_EL0, 3<<19 | 3<<16 | 7<<5},
 }
 
 var prfopfield = []struct {
src/cmd/internal/obj/arm64/list7.go
index cf92120cbb384af8cc6462dc1628b7c632be5956..37c61d22555ae66ec9e820d38cca2d5e4ba7dc45 100644
@@ -134,6 +134,8 @@ func rconv(r int) string {
                return "DAIFSet"
        case r == REG_DAIFClr:
                return "DAIFClr"
+       case r == REG_DCZID_EL0:
+               return "DCZID_EL0"
        case r == REG_PLDL1KEEP:
                return "PLDL1KEEP"
        case r == REG_PLDL1STRM:
src/runtime/memclr_arm64.s
index bf954e047f478026d57aba2aa1d3770bf7aa2ed7..159cac248682015027b511d87c4e847f9e1a1750 100644
 TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16
        MOVD    ptr+0(FP), R0
        MOVD    n+8(FP), R1
-       // If size is less than 16 bytes, use tail_zero to zero what remains
+
        CMP     $16, R1
-       BLT     tail_zero
-       // Get buffer offset into 16 byte aligned address for better performance
-       ANDS    $15, R0, ZR
-       BNE     unaligned_to_16
-aligned_to_16:
-       LSR     $4, R1, R2
+       // If n is equal to 16 bytes, use zero_exact_16 to zero
+       BEQ     zero_exact_16
+
+       // If n is greater than 16 bytes, use zero_by_16 to zero
+       BHI     zero_by_16
+
+       // n is less than 16 bytes
+       ADD     R1, R0, R7
+       TBZ     $3, R1, less_than_8
+       MOVD    ZR, (R0)
+       MOVD    ZR, -8(R7)
+       RET
+
+less_than_8:
+       TBZ     $2, R1, less_than_4
+       MOVW    ZR, (R0)
+       MOVW    ZR, -4(R7)
+       RET
+
+less_than_4:
+       CBZ     R1, ending
+       MOVB    ZR, (R0)
+       TBZ     $1, R1, ending
+       MOVH    ZR, -2(R7)
+
+ending:
+       RET
+
+zero_exact_16:
+       // n is exactly 16 bytes
+       STP     (ZR, ZR), (R0)
+       RET
+
 zero_by_16:
-       STP.P   (ZR, ZR), 16(R0)
-       SUBS    $1, R2, R2
-       BNE     zero_by_16
+       // n greater than 16 bytes, check if the start address is aligned
+       NEG     R0, R4
+       ANDS    $15, R4, R4
+       // Try zeroing using zva if the start address is aligned with 16
+       BEQ     try_zva
+
+       // Non-aligned store
+       STP     (ZR, ZR), (R0)
+       // Make the destination aligned
+       SUB     R4, R1, R1
+       ADD     R4, R0, R0
+       B       try_zva
+
+tail_maybe_long:
+       CMP     $64, R1
+       BHS     no_zva
 
+tail63:
+       ANDS    $48, R1, R3
+       BEQ     last16
+       CMPW    $32, R3
+       BEQ     last48
+       BLT     last32
+       STP.P   (ZR, ZR), 16(R0)
+last48:
+       STP.P   (ZR, ZR), 16(R0)
+last32:
+       STP.P   (ZR, ZR), 16(R0)
+       // The last store length is at most 16, so it is safe to use
+       // stp to write last 16 bytes
+last16:
        ANDS    $15, R1, R1
-       BEQ     ending
+       CBZ     R1, last_end
+       ADD     R1, R0, R0
+       STP     (ZR, ZR), -16(R0)
+last_end:
+       RET
 
-       // Zero buffer with size=R1 < 16
-tail_zero:
-       TBZ     $3, R1, tail_zero_4
-       MOVD.P  ZR, 8(R0)
+no_zva:
+       SUB     $16, R0, R0
+       SUB     $64, R1, R1
 
-tail_zero_4:
-       TBZ     $2, R1, tail_zero_2
-       MOVW.P  ZR, 4(R0)
+loop_64:
+       STP     (ZR, ZR), 16(R0)
+       STP     (ZR, ZR), 32(R0)
+       STP     (ZR, ZR), 48(R0)
+       STP.W   (ZR, ZR), 64(R0)
+       SUBS    $64, R1, R1
+       BGE     loop_64
+       ANDS    $63, R1, ZR
+       ADD     $16, R0, R0
+       BNE     tail63
+       RET
 
-tail_zero_2:
-       TBZ     $1, R1, tail_zero_1
-       MOVH.P  ZR, 2(R0)
+try_zva:
+       // Try using the ZVA feature to zero entire cache lines
+       // It is not meaningful to use ZVA if the block size is less than 64,
+       // so make sure that n is greater than or equal to 64
+       CMP     $63, R1
+       BLE     tail63
 
-tail_zero_1:
-       TBZ     $0, R1, ending
-       MOVB    ZR, (R0)
+       CMP     $128, R1
+       // Ensure n is at least 128 bytes, so that there is enough to copy after
+       // alignment.
+       BLT     no_zva
+       // Check if ZVA is allowed from user code, and if so get the block size
+       MOVW    block_size<>(SB), R5
+       TBNZ    $31, R5, no_zva
+       CBNZ    R5, zero_by_line
+        // DCZID_EL0 bit assignments
+        // [63:5] Reserved
+        // [4]    DZP, if bit set DC ZVA instruction is prohibited, else permitted
+        // [3:0]  log2 of the block size in words, eg. if it returns 0x4 then block size is 16 words
+       MRS     DCZID_EL0, R3
+       TBZ     $4, R3, init
+       // ZVA not available
+       MOVW    $~0, R5
+       MOVW    R5, block_size<>(SB)
+       B       no_zva
 
-ending:
+init:
+       MOVW    $4, R9
+       ANDW    $15, R3, R5
+       LSLW    R5, R9, R5
+       MOVW    R5, block_size<>(SB)
+
+       ANDS    $63, R5, R9
+       // Block size is less than 64.
+       BNE     no_zva
+
+zero_by_line:
+       CMP     R5, R1
+       // Not enough memory to reach alignment
+       BLO     no_zva
+       SUB     $1, R5, R6
+       NEG     R0, R4
+       ANDS    R6, R4, R4
+       // Already aligned
+       BEQ     aligned
+
+       // check there is enough to copy after alignment
+       SUB     R4, R1, R3
+
+       // Check that the remaining length to ZVA after alignment
+       // is greater than 64.
+       CMP     $64, R3
+       CCMP    GE, R3, R5, $10  // condition code GE, NZCV=0b1010
+       BLT     no_zva
+
+       // We now have at least 64 bytes to zero, update n
+       MOVD    R3, R1
+
+loop_zva_prolog:
+       STP     (ZR, ZR), (R0)
+       STP     (ZR, ZR), 16(R0)
+       STP     (ZR, ZR), 32(R0)
+       SUBS    $64, R4, R4
+       STP     (ZR, ZR), 48(R0)
+       ADD     $64, R0, R0
+       BGE     loop_zva_prolog
+
+       ADD     R4, R0, R0
+
+aligned:
+       SUB     R5, R1, R1
+
+loop_zva:
+       WORD    $0xd50b7420 // DC ZVA, R0
+       ADD     R5, R0, R0
+       SUBS    R5, R1, R1
+       BHS     loop_zva
+       ANDS    R6, R1, R1
+       BNE     tail_maybe_long
        RET
 
-unaligned_to_16:
-       MOVD    R0, R2
-head_loop:
-       MOVBU.P ZR, 1(R0)
-       ANDS    $15, R0, ZR
-       BNE     head_loop
-       // Adjust length for what remains
-       SUB     R2, R0, R3
-       SUB     R3, R1
-       // If size is less than 16 bytes, use tail_zero to zero what remains
-       CMP     $16, R1
-       BLT     tail_zero
-       B       aligned_to_16
+GLOBL block_size<>(SB), NOPTR, $8
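
For a sense of how the new routine carves up a request, the sketch below splits
a clear of n bytes at a given address into the alignment prologue, the whole
DC ZVA blocks, and the tail that the STP stores finish off. The helper name,
the thresholds as written, and the example values are illustrative only, not
part of the commit:

package main

import "fmt"

// splitForZVA mirrors the shape of the new assembly: small requests, or ones
// that cannot reach a full 64-byte ZVA block after alignment, fall back to
// plain stores; otherwise the aligned middle is zeroed one ZVA block at a time.
func splitForZVA(addr, n, block uintptr) (head, blocks, tail uintptr) {
	if n < 128 || block < 64 || n < block {
		return 0, 0, n // assembly's no_zva / tail paths
	}
	head = (block - addr%block) % block // bytes until ZVA alignment
	if n-head < 64 || n-head < block {
		return 0, 0, n
	}
	rem := n - head
	return head, rem / block, rem % block
}

func main() {
	head, blocks, tail := splitForZVA(0x1008, 4096, 64)
	fmt.Println(head, blocks, tail) // 56 63 8
}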