From: Dmitriy Vyukov
Date: Thu, 7 Aug 2014 09:34:30 +0000 (+0400)
Subject: runtime: convert markallocated from C to Go
X-Git-Tag: go1.4beta1~894
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=aac7f1a0d68984f3c776e90f775a7de9649f1a12;p=gostls13.git

runtime: convert markallocated from C to Go

benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%

Fixes #8452.

LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043
---

diff --git a/src/pkg/runtime/malloc.go b/src/pkg/runtime/malloc.go
index e7f23889af..73dc9f20d6 100644
--- a/src/pkg/runtime/malloc.go
+++ b/src/pkg/runtime/malloc.go
@@ -9,6 +9,8 @@ import (
 )
 
 const (
+	debugMalloc = false
+
 	flagNoScan      = 1 << 0 // GC doesn't have to scan object
 	flagNoProfiling = 1 << 1 // must not profile
 	flagNoZero      = 1 << 2 // don't zero memory
@@ -29,6 +31,22 @@ const (
 	pageShift = 13
 	pageSize  = 1 << pageShift
 	pageMask  = pageSize - 1
+
+	wordsPerBitmapWord = ptrSize * 8 / 4
+	gcBits             = 4
+	bitsPerPointer     = 2
+	bitsMask           = 1<<bitsPerPointer - 1
+	pointersPerByte    = 8 / bitsPerPointer
+	bitPtrMask         = bitsMask << 2
+	maxGCMask          = 64
+	bitsDead           = 0
+	bitsScalar         = 1
+	bitsPointer        = 2
+
+	bitBoundary  = 1
+	bitAllocated = 1
+	bitMarked    = 2
+	bitMask      = bitBoundary | bitMarked
 )
@@ -348,9 +366,106 @@
-	mp.ptrarg[0] = x
-	mp.ptrarg[1] = unsafe.Pointer(typ)
-	mp.scalararg[0] = uint(size)
-	mp.scalararg[1] = uint(size0)
-	mp.scalararg[2] = uint(flags & flagNoScan)
-	onM(&markallocated_m)
+	arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
+	off := (uintptr(x) - arena_start) / ptrSize
+	xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+	shift := (off % wordsPerBitmapWord) * gcBits
+	if debugMalloc && ((*xbits>>shift)&bitMask) != bitBoundary {
+		gothrow("bad bits in markallocated")
+	}
+	var ti, te uintptr
+	var ptrmask *uint8
+	if flags&flagNoScan != 0 {
+		// bitsDead in the first quadruple means don't scan.
+		if size == ptrSize {
+			*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated + (bitsDead << 2)) << shift)
+		} else {
+			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+			*xbitsb = bitAllocated + (bitsDead << 2)
+		}
+		goto marked
+	}
+	if size == ptrSize {
+		// It's one word and it has pointers, it must be a pointer.
+		*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated | (bitsPointer << 2)) << shift)
+		goto marked
+	}
+	if typ != nil && (uintptr(typ.gc[0])|uintptr(typ.gc[1])) != 0 && uintptr(typ.size) > ptrSize {
+		if typ.kind&kindGCProg != 0 {
+			nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
+			masksize := nptr
+			if masksize%2 != 0 {
+				masksize *= 2 // repeated
+			}
+			masksize = masksize * pointersPerByte / 8 // 4 bits per word
+			masksize++ // unroll flag in the beginning
+			if masksize > maxGCMask && typ.gc[1] != 0 {
+				// If the mask is too large, unroll the program directly
+				// into the GC bitmap. It's 7 times slower than copying
+				// from the pre-unrolled mask, but saves 1/16 of type size
+				// memory for the mask.
+				mp.ptrarg[0] = x
+				mp.ptrarg[1] = unsafe.Pointer(typ)
+				mp.scalararg[0] = uint(size)
+				mp.scalararg[1] = uint(size0)
+				onM(&unrollgcproginplace_m)
+				goto marked
+			}
+			ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
+			// Check whether the program is already unrolled.
+			if uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 {
+				mp.ptrarg[0] = unsafe.Pointer(typ)
+				onM(&unrollgcprog_m)
+			}
+			ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
+		} else {
+			ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask
+		}
+		if size == 2*ptrSize {
+			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+			*xbitsb = *ptrmask | bitAllocated
+			goto marked
+		}
+		te = uintptr(typ.size) / ptrSize
+		// If the type occupies odd number of words, its mask is repeated.
+		if te%2 == 0 {
+			te /= 2
+		}
+	}
+	if size == 2*ptrSize {
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = (bitsPointer << 2) | (bitsPointer << 6) | bitAllocated
+		goto marked
+	}
+	// Copy pointer bitmask into the bitmap.
+	for i := uintptr(0); i < size0; i += 2 * ptrSize {
+		v := uint8((bitsPointer << 2) | (bitsPointer << 6))
+		if ptrmask != nil {
+			v = *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
+			ti++
+			if ti == te {
+				ti = 0
+			}
+		}
+		if i == 0 {
+			v |= bitAllocated
+		}
+		if i+ptrSize == size0 {
+			v &= ^uint8(bitPtrMask << 4)
+		}
+
+		off := (uintptr(x) + i - arena_start) / ptrSize
+		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+		shift := (off % wordsPerBitmapWord) * gcBits
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = v
+	}
+	if size0%(2*ptrSize) == 0 && size0 < size {
+		// Mark the word after last object's word as bitsDead.
+		off := (uintptr(x) + size0 - arena_start) / ptrSize
+		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+		shift := (off % wordsPerBitmapWord) * gcBits
+		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+		*xbitsb = bitsDead << 2
+	}
+marked:
 	mp.mallocing = 0
 
 	if raceenabled {
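[Editor's note: the Go code above indexes the heap bitmap exactly as the deleted C version did: the bitmap sits just below arena_start and grows toward lower addresses, recording gcBits (4) bits of state per heap word, so one 64-bit bitmap word describes 16 heap words. The standalone sketch below only illustrates that address arithmetic; the names bitmapPos, arenaStart, and addr are hypothetical and not part of this change.]

package main

import "fmt"

const (
	ptrSize            = 8                    // bytes per heap word on a 64-bit system
	gcBits             = 4                    // bitmap bits recorded per heap word
	wordsPerBitmapWord = ptrSize * 8 / gcBits // heap words described by one bitmap word
)

// bitmapPos returns the address of the bitmap word that holds the state of
// the heap word at addr, and the shift of its gcBits within that word.
func bitmapPos(arenaStart, addr uintptr) (bitmapWord, shift uintptr) {
	off := (addr - arenaStart) / ptrSize // word index into the arena
	// The bitmap word for off sits off/wordsPerBitmapWord+1 words below arenaStart.
	bitmapWord = arenaStart - off/wordsPerBitmapWord*ptrSize - ptrSize
	shift = (off % wordsPerBitmapWord) * gcBits
	return
}

func main() {
	arenaStart := uintptr(0x100000000)
	for _, addr := range []uintptr{arenaStart, arenaStart + 8, arenaStart + 16*8} {
		w, s := bitmapPos(arenaStart, addr)
		fmt.Printf("heap word %#x -> bitmap word %#x, shift %d\n", addr, w, s)
	}
}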
diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h
index 43feef79ed..4b16c55536 100644
--- a/src/pkg/runtime/malloc.h
+++ b/src/pkg/runtime/malloc.h
@@ -517,7 +517,6 @@ void*	runtime·persistentalloc(uintptr size, uintptr align, uint64 *stat);
 int32	runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s);
 void	runtime·gc(int32 force);
 uintptr	runtime·sweepone(void);
-void	runtime·markallocated(void *v, uintptr size, uintptr size0, Type* typ, bool scan);
 void	runtime·markspan(void *v, uintptr size, uintptr n, bool leftover);
 void	runtime·unmarkspan(void *v, uintptr size);
 void	runtime·purgecachedstats(MCache*);
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 4637d68bce..8998a871ae 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -1741,7 +1741,7 @@ runtime·wakefing(void)
 	return res;
 }
 
-// Recursively GC program in prog.
+// Recursively unrolls GC program in prog.
 // mask is where to store the result.
 // ppos is a pointer to position in mask, in bits.
 // sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise).
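[Editor's note: a GC "program" is a compact, run-length-encoded description of which words of a type hold pointers; "unrolling" expands it into the flat ptrmask that markallocated copies into the bitmap. The runtime's real instruction set (insData, insArray, ...) is more involved; the sketch below uses a deliberately simplified, hypothetical (count, bits) encoding, and bitsScalar = 1 is an assumption, just to show the idea.]

package main

import "fmt"

// Two-bit per-word pointer classes, mirroring the bits* constants in malloc.go.
const (
	bitsDead    = 0
	bitsScalar  = 1
	bitsPointer = 2
)

// unroll expands a run-length-encoded program of (count, bits) pairs into a
// flat mask with one 2-bit class per word.
func unroll(prog [][2]uint8) []uint8 {
	var mask []uint8
	for _, ins := range prog {
		for i := uint8(0); i < ins[0]; i++ {
			mask = append(mask, ins[1])
		}
	}
	return mask
}

func main() {
	// Hypothetical type layout: pointer, scalar, scalar, pointer.
	prog := [][2]uint8{{1, bitsPointer}, {2, bitsScalar}, {1, bitsPointer}}
	fmt.Println(unroll(prog)) // [2 1 1 2]
}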
@@ -1833,11 +1833,20 @@ unrollglobgcprog(byte *prog, uintptr size)
 	return mask;
 }
 
-static void
-unrollgcproginplace(void *v, uintptr size, uintptr size0, Type *typ)
+void
+runtime·unrollgcproginplace_m(void)
 {
-	uintptr *b, off, shift, pos;
+	uintptr size, size0, *b, off, shift, pos;
 	byte *arena_start, *prog;
+	Type *typ;
+	void *v;
+
+	v = g->m->ptrarg[0];
+	typ = g->m->ptrarg[1];
+	size = g->m->scalararg[0];
+	size0 = g->m->scalararg[1];
+	g->m->ptrarg[0] = nil;
+	g->m->ptrarg[1] = nil;
 
 	pos = 0;
 	prog = (byte*)typ->gc[1];
@@ -1859,14 +1868,18 @@ unrollgcproginplace(void *v, uintptr size, uintptr size0, Type *typ)
 }
 
 // Unrolls GC program in typ->gc[1] into typ->gc[0]
-static void
-unrollgcprog(Type *typ)
+void
+runtime·unrollgcprog_m(void)
 {
 	static Lock lock;
+	Type *typ;
 	byte *mask, *prog;
 	uintptr pos;
 	uint32 x;
 
+	typ = g->m->ptrarg[0];
+	g->m->ptrarg[0] = nil;
+
 	runtime·lock(&lock);
 	mask = (byte*)typ->gc[0];
 	if(mask[0] == 0) {
@@ -1887,110 +1900,6 @@ unrollgcprog(Type *typ)
 	runtime·unlock(&lock);
 }
 
-void
-runtime·markallocated(void *v, uintptr size, uintptr size0, Type *typ, bool scan)
-{
-	uintptr *b, off, shift, i, ti, te, nptr, masksize;
-	byte *arena_start, x;
-	bool *ptrmask;
-
-	arena_start = runtime·mheap.arena_start;
-	off = (uintptr*)v - (uintptr*)arena_start;
-	b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-	shift = (off % wordsPerBitmapWord) * gcBits;
-	if(Debug && (((*b)>>shift)&bitMask) != bitBoundary) {
-		runtime·printf("runtime: bad bits in markallocated (%p) b=%p[%p]\n", v, b, *b);
-		runtime·throw("bad bits in markallocated");
-	}
-
-	if(!scan) {
-		// BitsDead in the first quadruple means don't scan.
-		if(size == PtrSize)
-			*b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated+(BitsDead<<2))<<shift);
-		else
-			((byte*)b)[shift/8] = bitAllocated+(BitsDead<<2);
-		return;
-	}
-	if(size == PtrSize) {
-		// It's one word and it has pointers, it must be a pointer.
-		*b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated|(BitsPointer<<2))<<shift);
-		return;
-	}
-	ti = te = 0;
-	ptrmask = nil;
-	if(typ != nil && (typ->gc[0]|typ->gc[1]) != 0 && typ->size > PtrSize) {
-		if(typ->kind&KindGCProg) {
-			nptr = ROUND(typ->size, PtrSize)/PtrSize;
-			masksize = nptr;
-			if(masksize%2)
-				masksize *= 2;	// repeated twice
-			masksize = masksize*PointersPerByte/8;	// 4 bits per word
-			masksize++;	// unroll flag in the beginning
-			if(masksize > MaxGCMask && typ->gc[1] != 0) {
-				// If the mask is too large, unroll the program directly
-				// into the GC bitmap. It's 7 times slower than copying
-				// from the pre-unrolled mask, but saves 1/16 of type size
-				// memory for the mask.
-				unrollgcproginplace(v, size, size0, typ);
-				return;
-			}
-			ptrmask = (byte*)typ->gc[0];
-			// check whether the program is already unrolled
-			if((runtime·atomicload((uint32*)ptrmask)&0xff) == 0)
-				unrollgcprog(typ);
-			ptrmask++;	// skip the unroll flag byte
-		} else
-			ptrmask = (byte*)&typ->gc[0];	// embed mask
-		if(size == 2*PtrSize) {
-			((byte*)b)[shift/8] = ptrmask[0] | bitAllocated;
-			return;
-		}
-		te = typ->size/PtrSize;
-		// if the type occupies odd number of words, its mask is repeated twice
-		if((te%2) == 0)
-			te /= 2;
-	}
-	if(size == 2*PtrSize) {
-		((byte*)b)[shift/8] = (BitsPointer<<2) | (BitsPointer<<6) | bitAllocated;
-		return;
-	}
-	// Copy pointer bitmask into the bitmap.
-	for(i = 0; i < size0; i += 2*PtrSize) {
-		x = (BitsPointer<<2) | (BitsPointer<<6);
-		if(ptrmask != nil) {
-			x = ptrmask[ti++];
-			if(ti == te)
-				ti = 0;
-		}
-		if(i == 0)
-			x |= bitAllocated;
-		if(i+PtrSize == size0)
-			x &= ~(bitPtrMask<<4);
-
-		off = (uintptr*)((byte*)v + i) - (uintptr*)arena_start;
-		b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
-		((byte*)b)[shift/8] = x;
-	}
-	if(size0%(2*PtrSize) == 0 && size0 < size) {
-		// mark the word after last object's word as BitsDead
-		off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
-		b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = (off % wordsPerBitmapWord) * gcBits;
-		((byte*)b)[shift/8] = BitsDead<<2;
-	}
-}
-
-void
-runtime·markallocated_m(void)
-{
-	M *mp;
-
-	mp = g->m;
-	runtime·markallocated(mp->ptrarg[0], mp->scalararg[0], mp->scalararg[1], mp->ptrarg[1], mp->scalararg[2] == 0);
-	mp->ptrarg[0] = nil;
-	mp->ptrarg[1] = nil;
-}
-
 // mark the span of memory at v as having n blocks of the given size.
 // if leftover is true, there is left over space at the end of the span.
 void
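[Editor's note: the _m thunks above show the calling convention used throughout the C-to-Go conversion: Go code cannot call runtime C functions directly, so it stores arguments in the per-M scratch slots (ptrarg/scalararg), switches to the g0 stack with onM, and the C-side thunk reads the slots and clears the pointer ones so no stale pointers linger. A minimal stand-in for that hand-off, with m, onM, and unrollgcprogM as hypothetical names rather than the runtime's real API:]

package main

import (
	"fmt"
	"unsafe"
)

// m mimics the scratch argument slots on the runtime's per-thread M struct.
type m struct {
	ptrarg    [2]unsafe.Pointer
	scalararg [2]uintptr
}

var mp m

// onM stands in for switching to the g0 (scheduler) stack and running fn there.
func onM(fn func()) { fn() }

// unrollgcprogM plays the role of a C-side _m thunk: it takes its arguments
// out of the scratch slots and clears the pointer slots when done.
func unrollgcprogM() {
	typ := mp.ptrarg[0]
	mp.ptrarg[0] = nil // don't leave a stale pointer behind
	fmt.Printf("would unroll gc program for type at %p\n", typ)
}

func main() {
	var someType int
	mp.ptrarg[0] = unsafe.Pointer(&someType)
	onM(unrollgcprogM)
}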
diff --git a/src/pkg/runtime/stubs.go b/src/pkg/runtime/stubs.go
index 8a2fc8a97e..fee18f0470 100644
--- a/src/pkg/runtime/stubs.go
+++ b/src/pkg/runtime/stubs.go
@@ -65,7 +65,9 @@ var (
 	mprofMalloc_m,
 	gc_m,
 	setFinalizer_m,
-	markallocated_m mFunction
+	markallocated_m,
+	unrollgcprog_m,
+	unrollgcproginplace_m mFunction
 )
 
 // memclr clears n bytes starting at ptr.
@@ -87,7 +89,12 @@ const (
 	concurrentSweep = true
 )
 
-// in asm_*.s
+// Atomic operations to read/write a pointer.
+// in stubs.goc
+func goatomicloadp(p unsafe.Pointer) unsafe.Pointer     // return *p
+func goatomicstorep(p unsafe.Pointer, v unsafe.Pointer) // *p = v
+
+// in stubs.goc
 // if *p == x { *p = y; return true } else { return false }, atomically
 //go:noescape
 func gocas(p *uint32, x uint32, y uint32) bool
diff --git a/src/pkg/runtime/stubs.goc b/src/pkg/runtime/stubs.goc
index c64e73de05..42a4bf1434 100644
--- a/src/pkg/runtime/stubs.goc
+++ b/src/pkg/runtime/stubs.goc
@@ -48,6 +48,16 @@ func gonanotime() (r int64) {
 	r = runtime·nanotime();
 }
 
+#pragma textflag NOSPLIT
+func goatomicloadp(p **byte) (v *byte) {
+	v = runtime·atomicloadp(p);
+}
+
+#pragma textflag NOSPLIT
+func goatomicstorep(p **byte, v *byte) {
+	runtime·atomicstorep(p, v);
+}
+
 #pragma textflag NOSPLIT
 func runtime·gocas(p *uint32, x uint32, y uint32) (ret bool) {
 	ret = runtime·cas(p, x, y);
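[Editor's note: goatomicloadp/goatomicstorep are thin NOSPLIT wrappers that give Go code access to the runtime's atomic pointer primitives; markallocated uses the load to inspect the mask's first ("unroll flag") byte without racing with a concurrent unroller. Outside the runtime the same check can be sketched with sync/atomic, as below; the 8-byte mask layout and little-endian byte order are assumptions of the example, not of the commit.]

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

func main() {
	// Byte 0 of the mask is the unroll flag; it becomes non-zero once a
	// GC program has been expanded into the mask.
	mask := make([]byte, 8)
	p := (*unsafe.Pointer)(unsafe.Pointer(&mask[0]))

	// Equivalent of the commit's
	//   uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0
	// check: load a word atomically and test its low byte (little-endian).
	if uintptr(atomic.LoadPointer(p))&0xff == 0 {
		fmt.Println("mask not unrolled yet; would call onM(&unrollgcprog_m)")
	}
}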