From: Michael Anthony Knyszek
Date: Thu, 13 Nov 2025 18:29:23 +0000 (+0000)
Subject: runtime: put AddCleanup cleanup arguments in their own allocation
X-Git-Tag: go1.26rc1~278
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=1bb1f2bf0c;p=gostls13.git

runtime: put AddCleanup cleanup arguments in their own allocation

Currently, AddCleanup just creates a simple closure that calls
`cleanup(arg)` as the actual cleanup function tracked internally.
However, the argument ends up getting its own allocation. If it's tiny,
then it can also end up sharing a tiny allocation slot with the object
we're adding the cleanup to. Since the closure is a GC root, it keeps
the argument alive, and with it the shared tiny slot and the object
itself, so we can end up with cleanups that never fire.

This change refactors the AddCleanup machinery to make the storage for
the argument separate and explicit. With that in place, it explicitly
allocates 16 bytes of storage for tiny arguments to side-step the tiny
allocator.

One would think this would cause an increase in memory use and more
bytes allocated, but that's actually wrong! It turns out that the
current "simple closure" actually creates _two_ closures. By making the
argument passing explicit, we eliminate one layer of closures, so this
actually results in a slightly faster AddCleanup overall and 16 bytes
less memory allocated.

goos: linux
goarch: amd64
pkg: runtime
cpu: AMD EPYC 7B13
                      │ before.bench │           after.bench            │
                      │    sec/op    │    sec/op     vs base            │
AddCleanupAndStop-64    124.5n ± 2%    103.7n ± 2%  -16.71% (p=0.002 n=6)

                      │ before.bench │           after.bench            │
                      │     B/op     │     B/op      vs base            │
AddCleanupAndStop-64     48.00 ± 0%     32.00 ± 0%  -33.33% (p=0.002 n=6)

                      │ before.bench │           after.bench            │
                      │  allocs/op   │  allocs/op    vs base            │
AddCleanupAndStop-64     3.000 ± 0%     2.000 ± 0%  -33.33% (p=0.002 n=6)

This change does, however, add 16 bytes of overhead to the cleanup
special itself, and makes each cleanup block entry 24 bytes instead of
8 bytes. This means the overall memory overhead delta with this change
is neutral, and we just have a faster cleanup. (Cleanup block entries
are transient, so I suspect any increase in memory overhead there is
negligible.)

Together with CL 719960, fixes #76007.

Change-Id: I81bf3e44339e71c016c30d80bb4ee151c8263d5c
Reviewed-on: https://go-review.googlesource.com/c/go/+/720321
Reviewed-by: Cherry Mui
Auto-Submit: Michael Knyszek
LUCI-TryBot-Result: Go LUCI
---

diff --git a/src/runtime/mcleanup.go b/src/runtime/mcleanup.go
index 383217aa05..fc71af9f3f 100644
--- a/src/runtime/mcleanup.go
+++ b/src/runtime/mcleanup.go
@@ -72,8 +72,9 @@ import (
 // pass the object to the [KeepAlive] function after the last point
 // where the object must remain reachable.
 func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup {
-	// Explicitly force ptr to escape to the heap.
+	// Explicitly force ptr and cleanup to escape to the heap.
 	ptr = abi.Escape(ptr)
+	cleanup = abi.Escape(cleanup)
 
 	// The pointer to the object must be valid.
 	if ptr == nil {
@@ -82,7 +83,8 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup {
 	usptr := uintptr(unsafe.Pointer(ptr))
 
 	// Check that arg is not equal to ptr.
- if kind := abi.TypeOf(arg).Kind(); kind == abi.Pointer || kind == abi.UnsafePointer { + argType := abi.TypeOf(arg) + if kind := argType.Kind(); kind == abi.Pointer || kind == abi.UnsafePointer { if unsafe.Pointer(ptr) == *((*unsafe.Pointer)(unsafe.Pointer(&arg))) { panic("runtime.AddCleanup: ptr is equal to arg, cleanup will never run") } @@ -98,12 +100,23 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { return Cleanup{} } - fn := func() { - cleanup(arg) + // Create new storage for the argument. + var argv *S + if size := unsafe.Sizeof(arg); size < maxTinySize && argType.PtrBytes == 0 { + // Side-step the tiny allocator to avoid liveness issues, since this box + // will be treated like a root by the GC. We model the box as an array of + // uintptrs to guarantee maximum allocator alignment. + // + // TODO(mknyszek): Consider just making space in cleanupFn for this. The + // unfortunate part of this is it would grow specialCleanup by 16 bytes, so + // while there wouldn't be an allocation, *every* cleanup would take the + // memory overhead hit. + box := new([maxTinySize / goarch.PtrSize]uintptr) + argv = (*S)(unsafe.Pointer(box)) + } else { + argv = new(S) } - // Closure must escape. - fv := *(**funcval)(unsafe.Pointer(&fn)) - fv = abi.Escape(fv) + *argv = arg // Find the containing object. base, _, _ := findObject(usptr, 0, 0) @@ -120,7 +133,16 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { gcCleanups.createGs() } - id := addCleanup(unsafe.Pointer(ptr), fv) + id := addCleanup(unsafe.Pointer(ptr), cleanupFn{ + // Instantiate a caller function to call the cleanup, that is cleanup(*argv). + // + // TODO(mknyszek): This allocates because the generic dictionary argument + // gets closed over, but callCleanup doesn't even use the dictionary argument, + // so theoretically that could be removed, eliminating an allocation. + call: callCleanup[S], + fn: *(**funcval)(unsafe.Pointer(&cleanup)), + arg: unsafe.Pointer(argv), + }) if debug.checkfinalizers != 0 { cleanupFn := *(**funcval)(unsafe.Pointer(&cleanup)) setCleanupContext(unsafe.Pointer(ptr), abi.TypeFor[T](), sys.GetCallerPC(), cleanupFn.fn, id) @@ -131,6 +153,16 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup { } } +// callCleanup is a helper for calling cleanups in a polymorphic way. +// +// In practice, all it does is call fn(*arg). arg must be a *T. +// +//go:noinline +func callCleanup[T any](fn *funcval, arg unsafe.Pointer) { + cleanup := *(*func(T))(unsafe.Pointer(&fn)) + cleanup(*(*T)(arg)) +} + // Cleanup is a handle to a cleanup call for a specific object. type Cleanup struct { // id is the unique identifier for the cleanup within the arena. @@ -216,7 +248,17 @@ const cleanupBlockSize = 512 // that the cleanup queue does not grow during marking (but it can shrink). type cleanupBlock struct { cleanupBlockHeader - cleanups [(cleanupBlockSize - unsafe.Sizeof(cleanupBlockHeader{})) / goarch.PtrSize]*funcval + cleanups [(cleanupBlockSize - unsafe.Sizeof(cleanupBlockHeader{})) / unsafe.Sizeof(cleanupFn{})]cleanupFn +} + +var cleanupFnPtrMask = [...]uint8{0b111} + +// cleanupFn represents a cleanup function with it's argument, yet to be called. +type cleanupFn struct { + // call is an adapter function that understands how to safely call fn(*arg). + call func(*funcval, unsafe.Pointer) + fn *funcval // cleanup function passed to AddCleanup. + arg unsafe.Pointer // pointer to argument to pass to cleanup function. 
} var cleanupBlockPtrMask [cleanupBlockSize / goarch.PtrSize / 8]byte @@ -245,8 +287,8 @@ type cleanupBlockHeader struct { // // Must only be called if the GC is in the sweep phase (gcphase == _GCoff), // because it does not synchronize with the garbage collector. -func (b *cleanupBlock) enqueue(fn *funcval) bool { - b.cleanups[b.n] = fn +func (b *cleanupBlock) enqueue(c cleanupFn) bool { + b.cleanups[b.n] = c b.n++ return b.full() } @@ -375,7 +417,7 @@ func (q *cleanupQueue) tryTakeWork() bool { // enqueue queues a single cleanup for execution. // // Called by the sweeper, and only the sweeper. -func (q *cleanupQueue) enqueue(fn *funcval) { +func (q *cleanupQueue) enqueue(c cleanupFn) { mp := acquirem() pp := mp.p.ptr() b := pp.cleanups @@ -396,7 +438,7 @@ func (q *cleanupQueue) enqueue(fn *funcval) { } pp.cleanups = b } - if full := b.enqueue(fn); full { + if full := b.enqueue(c); full { q.full.push(&b.lfnode) pp.cleanups = nil q.addWork(1) @@ -641,7 +683,8 @@ func runCleanups() { gcCleanups.beginRunningCleanups() for i := 0; i < int(b.n); i++ { - fn := b.cleanups[i] + c := b.cleanups[i] + b.cleanups[i] = cleanupFn{} var racectx uintptr if raceenabled { @@ -650,20 +693,15 @@ func runCleanups() { // the same goroutine. // // Synchronize on fn. This would fail to find races on the - // closed-over values in fn (suppose fn is passed to multiple - // AddCleanup calls) if fn was not unique, but it is. Update - // the synchronization on fn if you intend to optimize it - // and store the cleanup function and cleanup argument on the - // queue directly. - racerelease(unsafe.Pointer(fn)) + // closed-over values in fn (suppose arg is passed to multiple + // AddCleanup calls) if arg was not unique, but it is. + racerelease(unsafe.Pointer(c.arg)) racectx = raceEnterNewCtx() - raceacquire(unsafe.Pointer(fn)) + raceacquire(unsafe.Pointer(c.arg)) } // Execute the next cleanup. - cleanup := *(*func())(unsafe.Pointer(&fn)) - cleanup() - b.cleanups[i] = nil + c.call(c.fn, c.arg) if raceenabled { // Restore the old context. diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index dd76973c62..c9234c5084 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -204,7 +204,7 @@ func gcMarkRootCheck() { }) } -// ptrmask for an allocation containing a single pointer. +// oneptrmask for an allocation containing a single pointer. var oneptrmask = [...]uint8{1} // markroot scans the i'th root. @@ -251,7 +251,7 @@ func markroot(gcw *gcWork, i uint32, flushBgCredit bool) int64 { // N.B. This only needs to synchronize with cleanup execution, which only resets these blocks. // All cleanup queueing happens during sweep. n := uintptr(atomic.Load(&cb.n)) - scanblock(uintptr(unsafe.Pointer(&cb.cleanups[0])), n*goarch.PtrSize, &cleanupBlockPtrMask[0], gcw, nil) + scanblock(uintptr(unsafe.Pointer(&cb.cleanups[0])), n*unsafe.Sizeof(cleanupFn{}), &cleanupBlockPtrMask[0], gcw, nil) } case work.baseSpans <= i && i < work.baseStacks: @@ -489,7 +489,7 @@ func gcScanFinalizer(spf *specialfinalizer, s *mspan, gcw *gcWork) { // gcScanCleanup scans the relevant parts of a cleanup special as a root. func gcScanCleanup(spc *specialCleanup, gcw *gcWork) { // The special itself is a root. - scanblock(uintptr(unsafe.Pointer(&spc.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil) + scanblock(uintptr(unsafe.Pointer(&spc.cleanup)), unsafe.Sizeof(cleanupFn{}), &cleanupFnPtrMask[0], gcw, nil) } // gcAssistAlloc performs GC work to make gp's assist debt positive. 
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 3334099092..08a0057be7 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -2161,7 +2161,7 @@ func removefinalizer(p unsafe.Pointer) { type specialCleanup struct { _ sys.NotInHeap special special - fn *funcval + cleanup cleanupFn // Globally unique ID for the cleanup, obtained from mheap_.cleanupID. id uint64 } @@ -2170,14 +2170,18 @@ type specialCleanup struct { // cleanups are allowed on an object, and even the same pointer. // A cleanup id is returned which can be used to uniquely identify // the cleanup. -func addCleanup(p unsafe.Pointer, f *funcval) uint64 { +func addCleanup(p unsafe.Pointer, c cleanupFn) uint64 { + // TODO(mknyszek): Consider pooling specialCleanups on the P + // so we don't have to take the lock every time. Just locking + // is a considerable part of the cost of AddCleanup. This + // would also require reserving some cleanup IDs on the P. lock(&mheap_.speciallock) s := (*specialCleanup)(mheap_.specialCleanupAlloc.alloc()) mheap_.cleanupID++ // Increment first. ID 0 is reserved. id := mheap_.cleanupID unlock(&mheap_.speciallock) s.special.kind = _KindSpecialCleanup - s.fn = f + s.cleanup = c s.id = id mp := acquirem() @@ -2187,17 +2191,16 @@ func addCleanup(p unsafe.Pointer, f *funcval) uint64 { // situation where it's possible that markrootSpans // has already run but mark termination hasn't yet. if gcphase != _GCoff { - gcw := &mp.p.ptr().gcw // Mark the cleanup itself, since the // special isn't part of the GC'd heap. - scanblock(uintptr(unsafe.Pointer(&s.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil) + gcScanCleanup(s, &mp.p.ptr().gcw) } releasem(mp) - // Keep f alive. There's a window in this function where it's - // only reachable via the special while the special hasn't been - // added to the specials list yet. This is similar to a bug + // Keep c and its referents alive. There's a window in this function + // where it's only reachable via the special while the special hasn't + // been added to the specials list yet. This is similar to a bug // discovered for weak handles, see #70455. - KeepAlive(f) + KeepAlive(c) return id } @@ -2800,7 +2803,7 @@ func freeSpecial(s *special, p unsafe.Pointer, size uintptr) { // Cleanups, unlike finalizers, do not resurrect the objects // they're attached to, so we only need to pass the cleanup // function, not the object. - gcCleanups.enqueue(sc.fn) + gcCleanups.enqueue(sc.cleanup) lock(&mheap_.speciallock) mheap_.specialCleanupAlloc.free(unsafe.Pointer(sc)) unlock(&mheap_.speciallock)
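
To make the failure mode described in the commit message concrete, here is a
minimal, standalone Go sketch (not part of the patch; the object size, the
argument value, and the polling loop are arbitrary illustration choices). It
shows the shape of code that could previously keep a cleanup from ever
running: a pointer-free object small enough for the tiny allocator, paired
with an equally tiny, pointer-free cleanup argument. Whether the cleanup fires
on any given GC cycle is not guaranteed either way; the sketch only
illustrates the pattern.

package main

import (
	"fmt"
	"runtime"
	"time"
)

func main() {
	done := make(chan struct{})

	// Both the object and the argument are tiny (<16 bytes) and pointer-free,
	// so both are candidates for the tiny allocator. Before this change, the
	// closure created by AddCleanup could keep the argument's tiny slot (and
	// therefore an object sharing that slot) reachable indefinitely.
	obj := new([8]byte)
	arg := uint32(42)

	runtime.AddCleanup(obj, func(n uint32) {
		fmt.Println("cleanup ran with", n)
		close(done)
	}, arg)

	obj = nil // drop the last reference to the object

	// Whether and when the cleanup runs is up to the GC; poll a few cycles.
	for i := 0; i < 10; i++ {
		runtime.GC()
		select {
		case <-done:
			fmt.Println("cleanup observed")
			return
		case <-time.After(10 * time.Millisecond):
		}
	}
	fmt.Println("cleanup not observed")
}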
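
The cleanupFn/callCleanup pairing in the patch is a type-erasure pattern: a
per-type adapter (callCleanup[S]) is stored alongside untyped pointers to the
cleanup function and its boxed argument, so cleanup block entries stay a fixed
size. The following standalone sketch approximates that shape in ordinary Go,
assuming plain unsafe.Pointer conversions in place of the runtime's funcval
plumbing; erasedCleanup and adapt are invented names used only for
illustration.

package main

import (
	"fmt"
	"unsafe"
)

// erasedCleanup mirrors the shape of cleanupFn in the patch: a monomorphic
// adapter plus untyped pointers to the cleanup function and its boxed argument.
type erasedCleanup struct {
	call func(fn, arg unsafe.Pointer)
	fn   unsafe.Pointer // points to a func(T) value
	arg  unsafe.Pointer // points to a T
}

// adapt plays the role of AddCleanup here: it boxes the argument in its own
// storage and instantiates an adapter (like callCleanup[S]) that knows how to
// call fn(*arg) later.
func adapt[T any](fn func(T), arg T) erasedCleanup {
	argv := new(T) // explicit, separate storage for the argument
	*argv = arg
	return erasedCleanup{
		call: func(fp, ap unsafe.Pointer) {
			(*(*func(T))(fp))(*(*T)(ap))
		},
		fn:  unsafe.Pointer(&fn),
		arg: unsafe.Pointer(argv),
	}
}

func main() {
	// A homogeneous queue of cleanups with heterogeneous argument types,
	// analogous to the entries in a cleanupBlock.
	queue := []erasedCleanup{
		adapt(func(s string) { fmt.Println("cleanup:", s) }, "hello"),
		adapt(func(n int) { fmt.Println("cleanup:", n) }, 42),
	}
	for _, c := range queue {
		c.call(c.fn, c.arg)
	}
}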