runtime: remove the restriction that write barrier ptrs come in pairs

author Keith Randall <khr@golang.org>

Wed, 26 Oct 2022 00:58:07 +0000 (17:58 -0700)

committer Keith Randall <khr@golang.org>

Fri, 17 Feb 2023 22:19:26 +0000 (22:19 +0000)
author Keith Randall <khr@golang.org>
Wed, 26 Oct 2022 00:58:07 +0000 (17:58 -0700)
committer Keith Randall <khr@golang.org>
Fri, 17 Feb 2023 22:19:26 +0000 (22:19 +0000)
diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go

index 3dda480d36baddd3ef28a952a5ef6bbe5ab6a967..96dd0bf93579d60ad18516984886bd0d89a8e709 100644 (file)
--- a/src/cmd/compile/internal/test/inl_test.go
+++ b/src/cmd/compile/internal/test/inl_test.go
@@ -87,6 +87,8 @@ func TestIntendedInlining(t *testing.T) {
                         "(*mspan).markBitsForIndex",
                         "(*muintptr).set",
                         "(*puintptr).set",
+                       "(*wbBuf).get1",
+                       "(*wbBuf).get2",
                 },
                 "runtime/internal/sys": {},
                 "runtime/internal/math": {
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s

index 02179d2ee99ac33a5d91af7bb44e76d88207fa04..a03e5b0fe0c060b22a6c602c02b38e1bb758068c 100644 (file)
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -1377,6 +1377,7 @@ TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$28
         // faster than having the caller spill these.
         MOVL    CX, 20(SP)
         MOVL    BX, 24(SP)
+retry:
         // TODO: Consider passing g.m.p in as an argument so they can be shared
         // across a sequence of write barriers.
         get_tls(BX)
@@ -1386,15 +1387,15 @@ TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$28
         MOVL    (p_wbBuf+wbBuf_next)(BX), CX
         // Increment wbBuf.next position.
         LEAL    8(CX), CX
-       MOVL    CX, (p_wbBuf+wbBuf_next)(BX)
+       // Is the buffer full?
         CMPL    CX, (p_wbBuf+wbBuf_end)(BX)
+       JA      flush
+       // Commit to the larger buffer.
+       MOVL    CX, (p_wbBuf+wbBuf_next)(BX)
         // Record the write.
         MOVL    AX, -8(CX)      // Record value
         MOVL    (DI), BX        // TODO: This turns bad writes into bad reads.
         MOVL    BX, -4(CX)      // Record *slot
-       // Is the buffer full? (flags set in CMPL above)
-       JEQ     flush
-ret:
         MOVL    20(SP), CX
         MOVL    24(SP), BX
         // Do the write.
@@ -1404,8 +1405,8 @@ ret:
  flush:
         // Save all general purpose registers since these could be
         // clobbered by wbBufFlush and were not saved by the caller.
-       MOVL    DI, 0(SP)       // Also first argument to wbBufFlush
-       MOVL    AX, 4(SP)       // Also second argument to wbBufFlush
+       MOVL    DI, 0(SP)
+       MOVL    AX, 4(SP)
         // BX already saved
         // CX already saved
         MOVL    DX, 8(SP)
@@ -1413,7 +1414,6 @@ flush:
         MOVL    SI, 16(SP)
         // DI already saved
  
-       // This takes arguments DI and AX
         CALL    runtime·wbBufFlush(SB)
  
         MOVL    0(SP), DI
@@ -1421,7 +1421,7 @@ flush:
         MOVL    8(SP), DX
         MOVL    12(SP), BP
         MOVL    16(SP), SI
-       JMP     ret
+       JMP     retry
  
  // Note: these functions use a special calling convention to save generated code space.
  // Arguments are passed in registers, but the space for those arguments are allocated
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s

index 45afcda38f02e9a282434697e84f9e6f4402bf6f..6acb7ddaef0919a55fda8b3f62308044058e2aed 100644 (file)
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1634,15 +1634,20 @@ TEXT runtime·gcWriteBarrier<ABIInternal>(SB),NOSPLIT,$112
         // faster than having the caller spill these.
         MOVQ    R12, 96(SP)
         MOVQ    R13, 104(SP)
+retry:
         // TODO: Consider passing g.m.p in as an argument so they can be shared
         // across a sequence of write barriers.
         MOVQ    g_m(R14), R13
         MOVQ    m_p(R13), R13
+       // Get current buffer write position.
         MOVQ    (p_wbBuf+wbBuf_next)(R13), R12
         // Increment wbBuf.next position.
         LEAQ    16(R12), R12
-       MOVQ    R12, (p_wbBuf+wbBuf_next)(R13)
+       // Is the buffer full?
         CMPQ    R12, (p_wbBuf+wbBuf_end)(R13)
+       JA      flush
+       // Commit to the larger buffer.
+       MOVQ    R12, (p_wbBuf+wbBuf_next)(R13)
         // Record the write.
         MOVQ    AX, -16(R12)    // Record value
         // Note: This turns bad pointer writes into bad
@@ -1653,9 +1658,6 @@ TEXT runtime·gcWriteBarrier<ABIInternal>(SB),NOSPLIT,$112
         // combine the read and the write.
         MOVQ    (DI), R13
         MOVQ    R13, -8(R12)    // Record *slot
-       // Is the buffer full? (flags set in CMPQ above)
-       JEQ     flush
-ret:
         MOVQ    96(SP), R12
         MOVQ    104(SP), R13
         // Do the write.
@@ -1675,8 +1677,8 @@ flush:
         //
         // TODO: We could strike a different balance; e.g., saving X0
         // and not saving GP registers that are less likely to be used.
-       MOVQ    DI, 0(SP)       // Also first argument to wbBufFlush
-       MOVQ    AX, 8(SP)       // Also second argument to wbBufFlush
+       MOVQ    DI, 0(SP)
+       MOVQ    AX, 8(SP)
         MOVQ    BX, 16(SP)
         MOVQ    CX, 24(SP)
         MOVQ    DX, 32(SP)
@@ -1692,7 +1694,6 @@ flush:
         // R14 is g
         MOVQ    R15, 88(SP)
  
-       // This takes arguments DI and AX
         CALL    runtime·wbBufFlush(SB)
  
         MOVQ    0(SP), DI
@@ -1707,7 +1708,7 @@ flush:
         MOVQ    72(SP), R10
         MOVQ    80(SP), R11
         MOVQ    88(SP), R15
-       JMP     ret
+       JMP     retry
  
  // gcWriteBarrierCX is gcWriteBarrier, but with args in DI and CX.
  // Defined as ABIInternal since it does not use the stable Go ABI.
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s

index 591ef2a399fb76711a609b1920d1ae32354439bd..40a6e477928d645f0e81612c1185f200df2c1da7 100644 (file)
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -882,21 +882,22 @@ TEXT ·checkASM(SB),NOSPLIT,$0-1
  TEXT runtime·gcWriteBarrier(SB),NOSPLIT|NOFRAME,$0
         // Save the registers clobbered by the fast path.
         MOVM.DB.W       [R0,R1], (R13)
+retry:
         MOVW    g_m(g), R0
         MOVW    m_p(R0), R0
         MOVW    (p_wbBuf+wbBuf_next)(R0), R1
+       MOVW    (p_wbBuf+wbBuf_end)(R0), R11
         // Increment wbBuf.next position.
         ADD     $8, R1
+       // Is the buffer full?
+       CMP     R11, R1
+       BHI     flush
+       // Commit to the larger buffer.
         MOVW    R1, (p_wbBuf+wbBuf_next)(R0)
-       MOVW    (p_wbBuf+wbBuf_end)(R0), R0
-       CMP     R1, R0
         // Record the write.
         MOVW    R3, -8(R1)      // Record value
         MOVW    (R2), R0        // TODO: This turns bad writes into bad reads.
         MOVW    R0, -4(R1)      // Record *slot
-       // Is the buffer full? (flags set in CMP above)
-       B.EQ    flush
-ret:
         MOVM.IA.W       (R13), [R0,R1]
         // Do the write.
         MOVW    R3, (R2)
@@ -911,20 +912,16 @@ flush:
         // R11 is linker temp, so no need to save.
         // R13 is stack pointer.
         // R15 is PC.
-       //
-       // This also sets up R2 and R3 as the arguments to wbBufFlush.
         MOVM.DB.W       [R2-R9,R12], (R13)
         // Save R14 (LR) because the fast path above doesn't save it,
-       // but needs it to RET. This is after the MOVM so it appears below
-       // the arguments in the stack frame.
+       // but needs it to RET.
         MOVM.DB.W       [R14], (R13)
  
-       // This takes arguments R2 and R3.
         CALL    runtime·wbBufFlush(SB)
  
         MOVM.IA.W       (R13), [R14]
         MOVM.IA.W       (R13), [R2-R9,R12]
-       JMP     ret
+       JMP     retry
  
  // Note: these functions use a special calling convention to save generated code space.
  // Arguments are passed in registers, but the space for those arguments are allocated
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s

index 7eb5bcfd21502b0fb18b209b33d65edf0c265daa..bc9e73ffd63f18a3705bf202246a78e8f8287029 100644 (file)
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -1194,7 +1194,7 @@ TEXT ·checkASM(SB),NOSPLIT,$0-1
  // - R2 is the destination of the write
  // - R3 is the value being written at R2
  // It clobbers condition codes.
-// It does not clobber any general-purpose registers,
+// It does not clobber any general-purpose registers except R27,
  // but may clobber others (e.g., floating point registers)
  // The act of CALLing gcWriteBarrier will clobber R30 (LR).
  //
@@ -1203,21 +1203,22 @@ TEXT ·checkASM(SB),NOSPLIT,$0-1
  TEXT runtime·gcWriteBarrier<ABIInternal>(SB),NOSPLIT,$200
         // Save the registers clobbered by the fast path.
         STP     (R0, R1), 184(RSP)
+retry:
         MOVD    g_m(g), R0
         MOVD    m_p(R0), R0
-       MOVD    (p_wbBuf+wbBuf_next)(R0), R1
+        MOVD   (p_wbBuf+wbBuf_next)(R0), R1
+        MOVD   (p_wbBuf+wbBuf_end)(R0), R27
         // Increment wbBuf.next position.
         ADD     $16, R1
+       // Is the buffer full?
+       CMP     R27, R1
+       BHI     flush
+       // Commit to the larger buffer.
         MOVD    R1, (p_wbBuf+wbBuf_next)(R0)
-       MOVD    (p_wbBuf+wbBuf_end)(R0), R0
-       CMP     R1, R0
         // Record the write.
         MOVD    R3, -16(R1)     // Record value
         MOVD    (R2), R0        // TODO: This turns bad writes into bad reads.
         MOVD    R0, -8(R1)      // Record *slot
-       // Is the buffer full? (flags set in CMP above)
-       BEQ     flush
-ret:
         LDP     184(RSP), (R0, R1)
         // Do the write.
         MOVD    R3, (R2)
@@ -1227,7 +1228,7 @@ flush:
         // Save all general purpose registers since these could be
         // clobbered by wbBufFlush and were not saved by the caller.
         // R0 and R1 already saved
-       STP     (R2, R3), 1*8(RSP)      // Also first and second arguments to wbBufFlush
+       STP     (R2, R3), 1*8(RSP)
         STP     (R4, R5), 3*8(RSP)
         STP     (R6, R7), 5*8(RSP)
         STP     (R8, R9), 7*8(RSP)
@@ -1246,7 +1247,6 @@ flush:
         // R30 is LR, which was saved by the prologue.
         // R31 is SP.
  
-       // This takes arguments R2 and R3.
         CALL    runtime·wbBufFlush(SB)
         LDP     1*8(RSP), (R2, R3)
         LDP     3*8(RSP), (R4, R5)
@@ -1259,7 +1259,7 @@ flush:
         LDP     17*8(RSP), (R21, R22)
         LDP     19*8(RSP), (R23, R24)
         LDP     21*8(RSP), (R25, R26)
-       JMP     ret
+       JMP     retry
  
  DATA   debugCallFrameTooLarge<>+0x00(SB)/20, $"call frame too large"
  GLOBL  debugCallFrameTooLarge<>(SB), RODATA, $20       // Size duplicated below
diff --git a/src/runtime/asm_loong64.s b/src/runtime/asm_loong64.s

index a6ccd196c9a365643740c52fa6c07ec911022074..09a2964511faeb0bda4ee427b050cf9127360a19 100644 (file)
--- a/src/runtime/asm_loong64.s
+++ b/src/runtime/asm_loong64.s
@@ -628,21 +628,21 @@ TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$216
         // Save the registers clobbered by the fast path.
         MOVV    R19, 208(R3)
         MOVV    R13, 216(R3)
+retry:
         MOVV    g_m(g), R19
         MOVV    m_p(R19), R19
         MOVV    (p_wbBuf+wbBuf_next)(R19), R13
+       MOVV    (p_wbBuf+wbBuf_end)(R19), R30 // R30 is linker temp register
         // Increment wbBuf.next position.
         ADDV    $16, R13
+       // Is the buffer full?
+       BLTU    R30, R13, flush
+       // Commit to the larger buffer.
         MOVV    R13, (p_wbBuf+wbBuf_next)(R19)
-       MOVV    (p_wbBuf+wbBuf_end)(R19), R19
-       MOVV    R19, R30                // R30 is linker temp register
         // Record the write.
         MOVV    R28, -16(R13)   // Record value
         MOVV    (R27), R19      // TODO: This turns bad writes into bad reads.
         MOVV    R19, -8(R13)    // Record *slot
-       // Is the buffer full?
-       BEQ     R13, R30, flush
-ret:
         MOVV    208(R3), R19
         MOVV    216(R3), R13
         // Do the write.
@@ -652,8 +652,8 @@ ret:
  flush:
         // Save all general purpose registers since these could be
         // clobbered by wbBufFlush and were not saved by the caller.
-       MOVV    R27, 8(R3)      // Also first argument to wbBufFlush
-       MOVV    R28, 16(R3)     // Also second argument to wbBufFlush
+       MOVV    R27, 8(R3)
+       MOVV    R28, 16(R3)
         // R1 is LR, which was saved by the prologue.
         MOVV    R2, 24(R3)
         // R3 is SP.
@@ -686,8 +686,6 @@ flush:
         // R30 is tmp register.
         MOVV    R31, 200(R3)
  
-
-       // This takes arguments R27 and R28.
         CALL    runtime·wbBufFlush(SB)
  
         MOVV    8(R3), R27
@@ -715,7 +713,7 @@ flush:
         MOVV    184(R3), R26
         MOVV    192(R3), R29
         MOVV    200(R3), R31
-       JMP     ret
+       JMP     retry
  
  // Note: these functions use a special calling convention to save generated code space.
  // Arguments are passed in registers, but the space for those arguments are allocated
diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s

index 1abadb9c7df308857a84c30885c47c587f560df6..6f413db84b863d4de1655cea386ca466972c583c 100644 (file)
--- a/src/runtime/asm_mips64x.s
+++ b/src/runtime/asm_mips64x.s
@@ -644,21 +644,22 @@ TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$192
         // Save the registers clobbered by the fast path.
         MOVV    R1, 184(R29)
         MOVV    R2, 192(R29)
+retry:
         MOVV    g_m(g), R1
         MOVV    m_p(R1), R1
         MOVV    (p_wbBuf+wbBuf_next)(R1), R2
+       MOVV    (p_wbBuf+wbBuf_end)(R1), R23 // R23 is linker temp register
         // Increment wbBuf.next position.
         ADDV    $16, R2
+       // Is the buffer full?
+        SGTU   R2, R23, R23
+       BNE     R23, flush
+       // Commit to the larger buffer.
         MOVV    R2, (p_wbBuf+wbBuf_next)(R1)
-       MOVV    (p_wbBuf+wbBuf_end)(R1), R1
-       MOVV    R1, R23         // R23 is linker temp register
         // Record the write.
         MOVV    R21, -16(R2)    // Record value
         MOVV    (R20), R1       // TODO: This turns bad writes into bad reads.
         MOVV    R1, -8(R2)      // Record *slot
-       // Is the buffer full?
-       BEQ     R2, R23, flush
-ret:
         MOVV    184(R29), R1
         MOVV    192(R29), R2
         // Do the write.
@@ -668,8 +669,8 @@ ret:
  flush:
         // Save all general purpose registers since these could be
         // clobbered by wbBufFlush and were not saved by the caller.
-       MOVV    R20, 8(R29)     // Also first argument to wbBufFlush
-       MOVV    R21, 16(R29)    // Also second argument to wbBufFlush
+       MOVV    R20, 8(R29)
+       MOVV    R21, 16(R29)
         // R1 already saved
         // R2 already saved
         MOVV    R3, 24(R29)
@@ -702,7 +703,6 @@ flush:
         // R30 is g.
         // R31 is LR, which was saved by the prologue.
  
-       // This takes arguments R20 and R21.
         CALL    runtime·wbBufFlush(SB)
  
         MOVV    8(R29), R20
@@ -727,7 +727,7 @@ flush:
         MOVV    160(R29), R22
         MOVV    168(R29), R24
         MOVV    176(R29), R25
-       JMP     ret
+       JMP     retry
  
  // Note: these functions use a special calling convention to save generated code space.
  // Arguments are passed in registers, but the space for those arguments are allocated
diff --git a/src/runtime/asm_mipsx.s b/src/runtime/asm_mipsx.s

index 877c1bb97b7bcf8bec5dcc287f44ccfc050abf13..2fbbf13672f379e015e88aeb4e6424f9ed6861e0 100644 (file)
--- a/src/runtime/asm_mipsx.s
+++ b/src/runtime/asm_mipsx.s
@@ -637,21 +637,22 @@ TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$104
         // Save the registers clobbered by the fast path.
         MOVW    R1, 100(R29)
         MOVW    R2, 104(R29)
+retry:
         MOVW    g_m(g), R1
         MOVW    m_p(R1), R1
         MOVW    (p_wbBuf+wbBuf_next)(R1), R2
+       MOVW    (p_wbBuf+wbBuf_end)(R1), R23 // R23 is linker temp register
         // Increment wbBuf.next position.
         ADD     $8, R2
+       // Is the buffer full?
+       SGTU    R2, R23, R23
+       BNE     R23, flush
+       // Commit to the larger buffer.
         MOVW    R2, (p_wbBuf+wbBuf_next)(R1)
-       MOVW    (p_wbBuf+wbBuf_end)(R1), R1
-       MOVW    R1, R23         // R23 is linker temp register
         // Record the write.
         MOVW    R21, -8(R2)     // Record value
         MOVW    (R20), R1       // TODO: This turns bad writes into bad reads.
         MOVW    R1, -4(R2)      // Record *slot
-       // Is the buffer full?
-       BEQ     R2, R23, flush
-ret:
         MOVW    100(R29), R1
         MOVW    104(R29), R2
         // Do the write.
@@ -661,8 +662,8 @@ ret:
  flush:
         // Save all general purpose registers since these could be
         // clobbered by wbBufFlush and were not saved by the caller.
-       MOVW    R20, 4(R29)     // Also first argument to wbBufFlush
-       MOVW    R21, 8(R29)     // Also second argument to wbBufFlush
+       MOVW    R20, 4(R29)
+       MOVW    R21, 8(R29)
         // R1 already saved
         // R2 already saved
         MOVW    R3, 12(R29)
@@ -696,7 +697,6 @@ flush:
         // R30 is g.
         // R31 is LR, which was saved by the prologue.
  
-       // This takes arguments R20 and R21.
         CALL    runtime·wbBufFlush(SB)
  
         MOVW    4(R29), R20
@@ -723,7 +723,7 @@ flush:
         MOVW    88(R29), R24
         MOVW    92(R29), R25
         MOVW    96(R29), R28
-       JMP     ret
+       JMP     retry
  
  // Note: these functions use a special calling convention to save generated code space.
  // Arguments are passed in registers, but the space for those arguments are allocated
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s

index 61ff17a934259ce0295d5b1d533105a8cc2e4bec..4a30f38fc9e56b3738c5ac2835f41094c8024310 100644 (file)
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -938,22 +938,23 @@ TEXT ·checkASM(SB),NOSPLIT,$0-1
  // but may clobber any other register, *including* R31.
  TEXT runtime·gcWriteBarrier<ABIInternal>(SB),NOSPLIT,$112
         // The standard prologue clobbers R31.
-       // We use R18 and R19 as scratch registers.
+       // We use R18, R19, and R31 as scratch registers.
+retry:
         MOVD    g_m(g), R18
         MOVD    m_p(R18), R18
         MOVD    (p_wbBuf+wbBuf_next)(R18), R19
+       MOVD    (p_wbBuf+wbBuf_end)(R18), R31
         // Increment wbBuf.next position.
         ADD     $16, R19
+       // Is the buffer full?
+       CMPU    R31, R19
+       BLT     flush
+       // Commit to the larger buffer.
         MOVD    R19, (p_wbBuf+wbBuf_next)(R18)
-       MOVD    (p_wbBuf+wbBuf_end)(R18), R18
-       CMP     R18, R19
         // Record the write.
         MOVD    R21, -16(R19)   // Record value
         MOVD    (R20), R18      // TODO: This turns bad writes into bad reads.
         MOVD    R18, -8(R19)    // Record *slot
-       // Is the buffer full? (flags set in CMP above)
-       BEQ     flush
-ret:
         // Do the write.
         MOVD    R21, (R20)
         RET
@@ -961,8 +962,8 @@ ret:
  flush:
         // Save registers R0 through R15 since these were not saved by the caller.
         // We don't save all registers on ppc64 because it takes too much space.
-       MOVD    R20, (FIXED_FRAME+0)(R1)        // Also first argument to wbBufFlush
-       MOVD    R21, (FIXED_FRAME+8)(R1)        // Also second argument to wbBufFlush
+       MOVD    R20, (FIXED_FRAME+0)(R1)
+       MOVD    R21, (FIXED_FRAME+8)(R1)
         // R0 is always 0, so no need to spill.
         // R1 is SP.
         // R2 is SB.
@@ -981,7 +982,6 @@ flush:
         MOVD    R16, (FIXED_FRAME+96)(R1)
         MOVD    R17, (FIXED_FRAME+104)(R1)
  
-       // This takes arguments R20 and R21.
         CALL    runtime·wbBufFlush(SB)
  
         MOVD    (FIXED_FRAME+0)(R1), R20
@@ -998,7 +998,7 @@ flush:
         MOVD    (FIXED_FRAME+88)(R1), R15
         MOVD    (FIXED_FRAME+96)(R1), R16
         MOVD    (FIXED_FRAME+104)(R1), R17
-       JMP     ret
+       JMP     retry
  
  // Note: these functions use a special calling convention to save generated code space.
  // Arguments are passed in registers, but the space for those arguments are allocated
diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s

index 31b81aea12e14560ef5f28934d619af2ba7b2629..4c434ea551113350614c7147110cdeeb3057720e 100644 (file)
--- a/src/runtime/asm_riscv64.s
+++ b/src/runtime/asm_riscv64.s
@@ -714,10 +714,10 @@ TEXT ·unspillArgs(SB),NOSPLIT,$0-0
  
  // gcWriteBarrier performs a heap pointer write and informs the GC.
  //
-// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments:
-// - T0 is the destination of the write
-// - T1 is the value being written at T0.
-// It clobbers R30 (the linker temp register - REG_TMP).
+// gcWriteBarrier does NOT follow the Go ABI. It accepts the
+// number of bytes of buffer needed in X24, and returns a pointer
+// to the buffer spcae in X24.
+// It clobbers X31 aka T6 (the linker temp register - REG_TMP).
  // The act of CALLing gcWriteBarrier will clobber RA (LR).
  // It does not clobber any other general-purpose registers,
  // but may clobber others (e.g., floating point registers).
@@ -725,21 +725,21 @@ TEXT runtime·gcWriteBarrier<ABIInternal>(SB),NOSPLIT,$208
         // Save the registers clobbered by the fast path.
         MOV     A0, 24*8(X2)
         MOV     A1, 25*8(X2)
+retry:
         MOV     g_m(g), A0
         MOV     m_p(A0), A0
         MOV     (p_wbBuf+wbBuf_next)(A0), A1
+       MOV     (p_wbBuf+wbBuf_end)(A0), T6 // T6 is linker temp register (REG_TMP)
         // Increment wbBuf.next position.
         ADD     $16, A1
+       // Is the buffer full?
+       BLTU    T6, A1, flush
+       // Commit to the larger buffer.
         MOV     A1, (p_wbBuf+wbBuf_next)(A0)
-       MOV     (p_wbBuf+wbBuf_end)(A0), A0
-       MOV     A0, T6          // T6 is linker temp register (REG_TMP)
         // Record the write.
         MOV     T1, -16(A1)     // Record value
         MOV     (T0), A0        // TODO: This turns bad writes into bad reads.
         MOV     A0, -8(A1)      // Record *slot
-       // Is the buffer full?
-       BEQ     A1, T6, flush
-ret:
         MOV     24*8(X2), A0
         MOV     25*8(X2), A1
         // Do the write.
@@ -749,15 +749,13 @@ ret:
  flush:
         // Save all general purpose registers since these could be
         // clobbered by wbBufFlush and were not saved by the caller.
-       MOV     T0, 1*8(X2)     // Also first argument to wbBufFlush
-       MOV     T1, 2*8(X2)     // Also second argument to wbBufFlush
+       MOV     T0, 1*8(X2)
+       MOV     T1, 2*8(X2)
         // X0 is zero register
         // X1 is LR, saved by prologue
         // X2 is SP
         // X3 is GP
         // X4 is TP
-       // X5 is first arg to wbBufFlush (T0)
-       // X6 is second arg to wbBufFlush (T1)
         MOV     X7, 3*8(X2)
         MOV     X8, 4*8(X2)
         MOV     X9, 5*8(X2)
@@ -784,7 +782,6 @@ flush:
         MOV     X30, 23*8(X2)
         // X31 is tmp register.
  
-       // This takes arguments T0 and T1.
         CALL    runtime·wbBufFlush(SB)
  
         MOV     1*8(X2), T0
@@ -811,7 +808,7 @@ flush:
         MOV     22*8(X2), X29
         MOV     23*8(X2), X30
  
-       JMP     ret
+       JMP     retry
  
  // Note: these functions use a special calling convention to save generated code space.
  // Arguments are passed in registers (ssa/gen/RISCV64Ops.go), but the space for those
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s

index 96b20f43a81bc2e5a6b92394a09307c091b51aad..5332c9b234eeeda0eec43a2659e8ce8300c539f8 100644 (file)
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -790,20 +790,21 @@ TEXT ·checkASM(SB),NOSPLIT,$0-1
  TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$96
         // Save the registers clobbered by the fast path.
         MOVD    R4, 96(R15)
+retry:
         MOVD    g_m(g), R1
         MOVD    m_p(R1), R1
         // Increment wbBuf.next position.
         MOVD    $16, R4
         ADD     (p_wbBuf+wbBuf_next)(R1), R4
+       // Is the buffer full?
+       MOVD    (p_wbBuf+wbBuf_end)(R1), R10
+       CMPUBGT R4, R10, flush
+       // Commit to the larger buffer.
         MOVD    R4, (p_wbBuf+wbBuf_next)(R1)
-       MOVD    (p_wbBuf+wbBuf_end)(R1), R1
         // Record the write.
         MOVD    R3, -16(R4) // Record value
         MOVD    (R2), R10   // TODO: This turns bad writes into bad reads.
         MOVD    R10, -8(R4) // Record *slot
-       // Is the buffer full?
-       CMPBEQ  R4, R1, flush
-ret:
         MOVD    96(R15), R4
         // Do the write.
         MOVD    R3, (R2)
@@ -812,7 +813,7 @@ ret:
  flush:
         // Save all general purpose registers since these could be
         // clobbered by wbBufFlush and were not saved by the caller.
-       STMG    R2, R3, 8(R15)   // set R2 and R3 as arguments for wbBufFlush
+       STMG    R2, R3, 8(R15)
         MOVD    R0, 24(R15)
         // R1 already saved.
         // R4 already saved.
@@ -821,13 +822,12 @@ flush:
         // R14 is LR.
         // R15 is SP.
  
-       // This takes arguments R2 and R3.
         CALL    runtime·wbBufFlush(SB)
  
         LMG     8(R15), R2, R3   // restore R2 - R3
         MOVD    24(R15), R0      // restore R0
         LMG     32(R15), R5, R12 // restore R5 - R12
-       JMP     ret
+       JMP     retry
  
  // Note: these functions use a special calling convention to save generated code space.
  // Arguments are passed in registers, but the space for those arguments are allocated
diff --git a/src/runtime/asm_wasm.s b/src/runtime/asm_wasm.s

index e075c7259885999212b4032f5ebb3753ad9a5fc3..6666b554d68e8439ad05402bc22bb10f2bf05263 100644 (file)
--- a/src/runtime/asm_wasm.s
+++ b/src/runtime/asm_wasm.s
@@ -410,36 +410,52 @@ TEXT runtime·cgocallback(SB), NOSPLIT, $0-24
  // R0: the destination of the write (i64)
  // R1: the value being written (i64)
  TEXT runtime·gcWriteBarrier(SB), NOSPLIT, $16
-       // R3 = g.m
-       MOVD g_m(g), R3
-       // R4 = p
-       MOVD m_p(R3), R4
-       // R5 = wbBuf.next
-       MOVD p_wbBuf+wbBuf_next(R4), R5
-
-       // Record value
-       MOVD R1, 0(R5)
-       // Record *slot
-       MOVD (R0), 8(R5)
-
-       // Increment wbBuf.next
-       Get R5
-       I64Const $16
-       I64Add
-       Set R5
-       MOVD R5, p_wbBuf+wbBuf_next(R4)
-
-       Get R5
-       I64Load (p_wbBuf+wbBuf_end)(R4)
-       I64Eq
-       If
+       Loop
+               // R3 = g.m
+               MOVD g_m(g), R3
+               // R4 = p
+               MOVD m_p(R3), R4
+               // R5 = wbBuf.next
+               MOVD p_wbBuf+wbBuf_next(R4), R5
+
+               // Increment wbBuf.next
+               Get R5
+               I64Const $16
+               I64Add
+               Set R5
+
+               // Is the buffer full?
+               Get R5
+               I64Load (p_wbBuf+wbBuf_end)(R4)
+               I64LeU
+               If
+                       // Commit to the larger buffer.
+                       MOVD R5, p_wbBuf+wbBuf_next(R4)
+
+                       // Back up to write position (wasm stores can't use negative offsets)
+                       Get R5
+                       I64Const $16
+                       I64Sub
+                       Set R5
+
+                       // Record value
+                       MOVD R1, 0(R5)
+                       // Record *slot
+                       MOVD (R0), 8(R5)
+
+                       // Do the write
+                       MOVD R1, (R0)
+
+                       RET
+               End
+
                 // Flush
                 MOVD R0, 0(SP)
                 MOVD R1, 8(SP)
                 CALLNORESUME runtime·wbBufFlush(SB)
-       End
+               MOVD 0(SP), R0
+               MOVD 8(SP), R1
  
-       // Do the write
-       MOVD R1, (R0)
-
-       RET
+               // Retry
+               Br $0
+       End
diff --git a/src/runtime/atomic_pointer.go b/src/runtime/atomic_pointer.go

index 26dfbfc2cc008f01de596e6ae4208fd0fcbac2af..b61bf0b8b2a193fb0a8fdb15c6808a183b77646f 100644 (file)
--- a/src/runtime/atomic_pointer.go
+++ b/src/runtime/atomic_pointer.go
@@ -21,9 +21,9 @@ import (
  //go:nosplit
  func atomicwb(ptr *unsafe.Pointer, new unsafe.Pointer) {
         slot := (*uintptr)(unsafe.Pointer(ptr))
-       if !getg().m.p.ptr().wbBuf.putFast(*slot, uintptr(new)) {
-               wbBufFlush()
-       }
+       buf := getg().m.p.ptr().wbBuf.get2()
+       buf[0] = *slot
+       buf[1] = uintptr(new)
  }
  
  // atomicstorep performs *ptr = new atomically and invokes a write barrier.
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go

index a3a8b2e70a5e077729e81d46238ba4c05748c7a6..7c5856d9e7d74e474fa416c67498d6b69a73da85 100644 (file)
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -573,9 +573,8 @@ func bulkBarrierPreWrite(dst, src, size uintptr) {
                                 break
                         }
                         dstx := (*uintptr)(unsafe.Pointer(addr))
-                       if !buf.putFast(*dstx, 0) {
-                               wbBufFlush()
-                       }
+                       p := buf.get1()
+                       p[0] = *dstx
                 }
         } else {
                 for {
@@ -585,9 +584,9 @@ func bulkBarrierPreWrite(dst, src, size uintptr) {
                         }
                         dstx := (*uintptr)(unsafe.Pointer(addr))
                         srcx := (*uintptr)(unsafe.Pointer(src + (addr - dst)))
-                       if !buf.putFast(*dstx, *srcx) {
-                               wbBufFlush()
-                       }
+                       p := buf.get2()
+                       p[0] = *dstx
+                       p[1] = *srcx
                 }
         }
  }
@@ -617,9 +616,8 @@ func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr) {
                         break
                 }
                 srcx := (*uintptr)(unsafe.Pointer(addr - dst + src))
-               if !buf.putFast(0, *srcx) {
-                       wbBufFlush()
-               }
+               p := buf.get1()
+               p[0] = *srcx
         }
  }
  
@@ -650,14 +648,13 @@ func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
                 if *bits&mask != 0 {
                         dstx := (*uintptr)(unsafe.Pointer(dst + i))
                         if src == 0 {
-                               if !buf.putFast(*dstx, 0) {
-                                       wbBufFlush()
-                               }
+                               p := buf.get1()
+                               p[0] = *dstx
                         } else {
                                 srcx := (*uintptr)(unsafe.Pointer(src + i))
-                               if !buf.putFast(*dstx, *srcx) {
-                                       wbBufFlush()
-                               }
+                               p := buf.get2()
+                               p[0] = *dstx
+                               p[1] = *srcx
                         }
                 }
                 mask <<= 1
@@ -709,9 +706,9 @@ func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
                 if bits&1 != 0 {
                         dstx := (*uintptr)(unsafe.Pointer(dst + i))
                         srcx := (*uintptr)(unsafe.Pointer(src + i))
-                       if !buf.putFast(*dstx, *srcx) {
-                               wbBufFlush()
-                       }
+                       p := buf.get2()
+                       p[0] = *dstx
+                       p[1] = *srcx
                 }
         }
  }
diff --git a/src/runtime/mwbbuf.go b/src/runtime/mwbbuf.go

index 9b92c926752d3b1603dbe1524c1382ba27d58765..4236cfb838429ec2ea7e22d1b6c67355e87649c9 100644 (file)
--- a/src/runtime/mwbbuf.go
+++ b/src/runtime/mwbbuf.go
@@ -53,15 +53,13 @@ type wbBuf struct {
         // be updated without write barriers.
         end uintptr
  
-       // buf stores a series of pointers to execute write barriers
-       // on. This must be a multiple of wbBufEntryPointers because
-       // the write barrier only checks for overflow once per entry.
-       buf [wbBufEntryPointers * wbBufEntries]uintptr
+       // buf stores a series of pointers to execute write barriers on.
+       buf [wbBufEntries]uintptr
  }
  
  const (
-       // wbBufEntries is the number of write barriers between
-       // flushes of the write barrier buffer.
+       // wbBufEntries is the maximum number of pointers that can be
+       // stored in the write barrier buffer.
         //
         // This trades latency for throughput amortization. Higher
         // values amortize flushing overhead more, but increase the
@@ -69,11 +67,11 @@ const (
         // footprint of the buffer.
         //
         // TODO: What is the latency cost of this? Tune this value.
-       wbBufEntries = 256
+       wbBufEntries = 512
  
-       // wbBufEntryPointers is the number of pointers added to the
-       // buffer by each write barrier.
-       wbBufEntryPointers = 2
+       // Maximum number of entries that we need to ask from the
+       // buffer in a single call.
+       wbMaxEntriesPerCall = 2
  )
  
  // reset empties b by resetting its next and end pointers.
@@ -81,16 +79,15 @@ func (b *wbBuf) reset() {
         start := uintptr(unsafe.Pointer(&b.buf[0]))
         b.next = start
         if testSmallBuf {
-               // For testing, allow two barriers in the buffer. If
-               // we only did one, then barriers of non-heap pointers
-               // would be no-ops. This lets us combine a buffered
-               // barrier with a flush at a later time.
-               b.end = uintptr(unsafe.Pointer(&b.buf[2*wbBufEntryPointers]))
+               // For testing, make the buffer smaller but more than
+               // 1 write barrier's worth, so it tests both the
+               // immediate flush and delayed flush cases.
+               b.end = uintptr(unsafe.Pointer(&b.buf[wbMaxEntriesPerCall+1]))
         } else {
                 b.end = start + uintptr(len(b.buf))*unsafe.Sizeof(b.buf[0])
         }
  
-       if (b.end-b.next)%(wbBufEntryPointers*unsafe.Sizeof(b.buf[0])) != 0 {
+       if (b.end-b.next)%unsafe.Sizeof(b.buf[0]) != 0 {
                 throw("bad write barrier buffer bounds")
         }
  }
@@ -109,13 +106,12 @@ func (b *wbBuf) empty() bool {
         return b.next == uintptr(unsafe.Pointer(&b.buf[0]))
  }
  
-// putFast adds old and new to the write barrier buffer and returns
-// false if a flush is necessary. Callers should use this as:
+// getX returns space in the write barrier buffer to store X pointers.
+// getX will flush the buffer if necessary. Callers should use this as:
  //
  //     buf := &getg().m.p.ptr().wbBuf
-//     if !buf.putFast(old, new) {
-//         wbBufFlush()
-//     }
+//     p := buf.get2()
+//     p[0], p[1] = old, new
  //     ... actual memory write ...
  //
  // The caller must ensure there are no preemption points during the
@@ -125,19 +121,31 @@ func (b *wbBuf) empty() bool {
  // could allow a GC phase change, which could result in missed write
  // barriers.
  //
-// putFast must be nowritebarrierrec to because write barriers here would
+// getX must be nowritebarrierrec to because write barriers here would
  // corrupt the write barrier buffer. It (and everything it calls, if
  // it called anything) has to be nosplit to avoid scheduling on to a
  // different P and a different buffer.
  //
  //go:nowritebarrierrec
  //go:nosplit
-func (b *wbBuf) putFast(old, new uintptr) bool {
+func (b *wbBuf) get1() *[1]uintptr {
+       if b.next+goarch.PtrSize > b.end {
+               wbBufFlush()
+       }
+       p := (*[1]uintptr)(unsafe.Pointer(b.next))
+       b.next += goarch.PtrSize
+       return p
+}
+
+//go:nowritebarrierrec
+//go:nosplit
+func (b *wbBuf) get2() *[2]uintptr {
+       if b.next+2*goarch.PtrSize > b.end {
+               wbBufFlush()
+       }
         p := (*[2]uintptr)(unsafe.Pointer(b.next))
-       p[0] = old
-       p[1] = new
         b.next += 2 * goarch.PtrSize
-       return b.next != b.end
+       return p
  }
  
  // wbBufFlush flushes the current P's write barrier buffer to the GC
@@ -159,13 +167,6 @@ func wbBufFlush() {
         // Note: Every possible return from this function must reset
         // the buffer's next pointer to prevent buffer overflow.
  
-       // This *must not* modify its arguments because this
-       // function's argument slots do double duty in gcWriteBarrier
-       // as register spill slots. Currently, not modifying the
-       // arguments is sufficient to keep the spill slots unmodified
-       // (which seems unlikely to change since it costs little and
-       // helps with debugging).
-
         if getg().m.dying > 0 {
                 // We're going down. Not much point in write barriers
                 // and this way we can allow write barriers in the
@@ -175,7 +176,7 @@ func wbBufFlush() {
         }
  
         // Switch to the system stack so we don't have to worry about
-       // the untyped stack slots or safe points.
+       // safe points.
         systemstack(func() {
                 wbBufFlush1(getg().m.p.ptr())
         })
author	Keith Randall <khr@golang.org>
	Wed, 26 Oct 2022 00:58:07 +0000 (17:58 -0700)
committer	Keith Randall <khr@golang.org>
	Fri, 17 Feb 2023 22:19:26 +0000 (22:19 +0000)
src/cmd/compile/internal/test/inl_test.go		patch \| blob \| history
src/runtime/asm_386.s		patch \| blob \| history
src/runtime/asm_amd64.s		patch \| blob \| history
src/runtime/asm_arm.s		patch \| blob \| history
src/runtime/asm_arm64.s		patch \| blob \| history
src/runtime/asm_loong64.s		patch \| blob \| history
src/runtime/asm_mips64x.s		patch \| blob \| history
src/runtime/asm_mipsx.s		patch \| blob \| history
src/runtime/asm_ppc64x.s		patch \| blob \| history
src/runtime/asm_riscv64.s		patch \| blob \| history
src/runtime/asm_s390x.s		patch \| blob \| history
src/runtime/asm_wasm.s		patch \| blob \| history
src/runtime/atomic_pointer.go		patch \| blob \| history
src/runtime/mbitmap.go		patch \| blob \| history
src/runtime/mwbbuf.go		patch \| blob \| history