From 56fb8350c835d1ccf0e6cdb8f753c85e2e0748a8 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek
Date: Wed, 18 Sep 2024 01:58:50 +0000
Subject: [PATCH] runtime: don't call span.heapBits in writeHeapBitsSmall
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

For whatever reason, span.heapBits is kind of slow. It accounts for
about a quarter of the cost of writeHeapBitsSmall, which is absurd.
We get a nice speed improvement for small allocations by eliminating
this call.

                   │   before    │                after                │
                   │   sec/op    │   sec/op     vs base                │
MallocTypeInfo16-4   29.47n ± 1%   27.02n ± 1%  -8.31% (p=0.002 n=6)

Change-Id: I6270e26902e5a9254cf1503fac81c3c799c59d6a
Reviewed-on: https://go-review.googlesource.com/c/go/+/614255
Reviewed-by: Keith Randall
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Keith Randall
Auto-Submit: Michael Knyszek
---
 src/runtime/malloc.go  | 10 ++++++++++
 src/runtime/mbitmap.go | 15 +++++++++------
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 7076ced453..71dda120d4 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -427,8 +427,15 @@ func mallocinit() {
 	// Check that the minimum size (exclusive) for a malloc header is also
 	// a size class boundary. This is important to making sure checks align
 	// across different parts of the runtime.
+	//
+	// While we're here, also check to make sure all these size classes'
+	// span sizes are one page. Some code relies on this.
 	minSizeForMallocHeaderIsSizeClass := false
+	sizeClassesUpToMinSizeForMallocHeaderAreOnePage := true
 	for i := 0; i < len(class_to_size); i++ {
+		if class_to_allocnpages[i] > 1 {
+			sizeClassesUpToMinSizeForMallocHeaderAreOnePage = false
+		}
 		if minSizeForMallocHeader == uintptr(class_to_size[i]) {
 			minSizeForMallocHeaderIsSizeClass = true
 			break
@@ -437,6 +444,9 @@ func mallocinit() {
 	if !minSizeForMallocHeaderIsSizeClass {
 		throw("min size of malloc header is not a size class boundary")
 	}
+	if !sizeClassesUpToMinSizeForMallocHeaderAreOnePage {
+		throw("expected all size classes up to min size for malloc header to fit in one-page spans")
+	}
 	// Check that the pointer bitmap for all small sizes without a malloc header
 	// fits in a word.
 	if minSizeForMallocHeader/goarch.PtrSize > 8*goarch.PtrSize {
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index a25995f46f..bae008b432 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -642,8 +642,7 @@ func (span *mspan) writeHeapBitsSmall(x, dataSize uintptr, typ *_type) (scanSize
 	// The objects here are always really small, so a single load is sufficient.
 	src0 := readUintptr(typ.GCData)
 
-	// Create repetitions of the bitmap if we have a small array.
-	bits := span.elemsize / goarch.PtrSize
+	// Create repetitions of the bitmap if we have a small slice backing store.
 	scanSize = typ.PtrBytes
 	src := src0
 	switch typ.Size_ {
@@ -658,19 +657,23 @@ func (span *mspan) writeHeapBitsSmall(x, dataSize uintptr, typ *_type) (scanSize
 
 	// Since we're never writing more than one uintptr's worth of bits, we're either going
 	// to do one or two writes.
-	dst := span.heapBits()
+	dst := unsafe.Pointer(span.base() + pageSize - pageSize/goarch.PtrSize/8)
 	o := (x - span.base()) / goarch.PtrSize
 	i := o / ptrBits
 	j := o % ptrBits
+	bits := span.elemsize / goarch.PtrSize
 	if j+bits > ptrBits {
 		// Two writes.
 		bits0 := ptrBits - j
 		bits1 := bits - bits0
-		dst[i+0] = dst[i+0]&(^uintptr(0)>>bits0) | (src << j)
-		dst[i+1] = dst[i+1]&^((1<<bits1)-1) | (src >> bits0)
+		dst0 := (*uintptr)(add(dst, (i+0)*goarch.PtrSize))
+		dst1 := (*uintptr)(add(dst, (i+1)*goarch.PtrSize))
+		*dst0 = (*dst0)&(^uintptr(0)>>bits0) | (src << j)
+		*dst1 = (*dst1)&^((1<<bits1)-1) | (src >> bits0)
 	} else {
 		// One write.
-		dst[i] = (dst[i] &^ (((1 << bits) - 1) << j)) | (src << j)
+		dst := (*uintptr)(add(dst, i*goarch.PtrSize))
+		*dst = (*dst)&^(((1<<bits)-1)<<j) | (src << j)
 	}
 
 	const doubleCheck = false
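
The inlined replacement for span.heapBits only works because every span
on this fast path is exactly one page, which is what the new mallocinit
check enforces: the inline mark bitmap needs one bit per pointer-sized
word, i.e. pageSize/goarch.PtrSize/8 bytes, and it sits at the very end
of the span's single page. A rough standalone sketch of that arithmetic
(the constants below are hypothetical amd64-like values, not the
runtime's actual configuration):

	package main

	import "fmt"

	// Hypothetical amd64-like values: 8 KiB pages, 8-byte pointers.
	// The runtime's real values are its pageSize and goarch.PtrSize
	// constants.
	const (
		pageSize = 8192
		ptrSize  = 8
	)

	func main() {
		// One mark bit per pointer-sized word of the page:
		// pageSize/ptrSize bits = pageSize/ptrSize/8 bytes,
		// stored at the very end of the page.
		bitmapBytes := uintptr(pageSize / ptrSize / 8)
		base := uintptr(0x1000000) // hypothetical span.base()
		dst := base + pageSize - bitmapBytes
		fmt.Printf("heap bitmap: [%#x, %#x), %d bytes\n",
			dst, base+pageSize, bitmapBytes)
	}

For a multi-page span this address would point into the middle of the
span rather than at its bitmap, which is why mallocinit now throws if
any size class below minSizeForMallocHeader uses more than one page.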
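The one-or-two-write masking is the same trick in both the old and new
code; only the addressing changed. The following self-contained sketch
(writeBits is an illustrative name, not a runtime function; 64-bit
words assumed) applies the identical masks to a plain []uint64:

	package main

	import "fmt"

	const ptrBits = 64

	// writeBits ORs the low n bits of src into dst at bit offset o,
	// clearing the destination bits first. A run of at most ptrBits
	// bits either fits in one word or straddles exactly two.
	func writeBits(dst []uint64, o, n uint, src uint64) {
		i := o / ptrBits // word index
		j := o % ptrBits // bit offset within that word
		if j+n > ptrBits {
			// Two writes: the low ptrBits-j bits of src fill the
			// top of dst[i]; the remaining bits go to the bottom
			// of dst[i+1].
			bits0 := ptrBits - j
			bits1 := n - bits0
			dst[i+0] = dst[i+0]&(^uint64(0)>>bits0) | (src << j)
			dst[i+1] = dst[i+1]&^((1<<bits1)-1) | (src >> bits0)
		} else {
			// One write: clear an n-bit window at offset j,
			// then merge src into it.
			dst[i] = dst[i]&^(((1<<n)-1)<<j) | (src << j)
		}
	}

	func main() {
		dst := make([]uint64, 2)
		writeBits(dst, 60, 8, 0xab) // straddles the word boundary
		fmt.Printf("%#x %#x\n", dst[0], dst[1])
	}

This prints 0xb000000000000000 0xa: the low four bits of 0xab land at
the top of the first word and the high four at the bottom of the second.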