runtime: convert markallocated from C to Go
author    Dmitriy Vyukov <dvyukov@google.com>
          Thu, 7 Aug 2014 09:34:30 +0000 (13:34 +0400)
committer Dmitriy Vyukov <dvyukov@google.com>
          Thu, 7 Aug 2014 09:34:30 +0000 (13:34 +0400)
benchmark                      old ns/op     new ns/op     delta
BenchmarkMalloc8               28.7          22.4          -21.95%
BenchmarkMalloc16              44.8          33.8          -24.55%
BenchmarkMallocTypeInfo8       49.0          32.9          -32.86%
BenchmarkMallocTypeInfo16      46.7          35.8          -23.34%
BenchmarkMallocLargeStruct     907           901           -0.66%
BenchmarkGobDecode             13235542      12036851      -9.06%
BenchmarkGobEncode             10639699      9539155       -10.34%
BenchmarkJSONEncode            25193036      21898922      -13.08%
BenchmarkJSONDecode            96104044      89464904      -6.91%
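
For scale, the allocation benchmarks above have roughly this shape: a tight loop of small heap allocations, so ns/op approximates the allocator fast path plus the bitmap marking this change rewrites. A minimal sketch, assuming a test in package runtime_test (the real BenchmarkMalloc8 lives in the runtime's tests and may differ):

    package runtime_test

    import "testing"

    // sink keeps the allocation from being optimized away.
    var sink *[8]byte

    func BenchmarkMalloc8(b *testing.B) {
            for i := 0; i < b.N; i++ {
                    sink = new([8]byte) // one 8-byte heap allocation per iteration
            }
    }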

Fixes #8452.

LGTM=khr
R=golang-codereviews, bradfitz, rsc, dave, khr
CC=golang-codereviews
https://golang.org/cl/122090043

src/pkg/runtime/malloc.go
src/pkg/runtime/malloc.h
src/pkg/runtime/mgc0.c
src/pkg/runtime/stubs.go
src/pkg/runtime/stubs.goc

index e7f23889af1ea3e470b9e48e69d433640af098aa..73dc9f20d65b90187ab611f931b2c85e28f066ce 100644 (file)
@@ -9,6 +9,8 @@ import (
 )
 
 const (
+       debugMalloc = false
+
        flagNoScan      = 1 << 0 // GC doesn't have to scan object
        flagNoProfiling = 1 << 1 // must not profile
        flagNoZero      = 1 << 2 // don't zero memory
@@ -29,6 +31,22 @@ const (
        pageShift = 13
        pageSize  = 1 << pageShift
        pageMask  = pageSize - 1
+
+       wordsPerBitmapWord = ptrSize * 8 / 4
+       gcBits             = 4
+       bitsPerPointer     = 2
+       bitsMask           = 1<<bitsPerPointer - 1
+       pointersPerByte    = 8 / bitsPerPointer
+       bitPtrMask         = bitsMask << 2
+       maxGCMask          = 0 // disabled because it wastes several bytes of memory
+       bitsDead           = 0
+       bitsPointer        = 2
+
+       bitMiddle    = 0
+       bitBoundary  = 1
+       bitAllocated = 2
+       bitMarked    = 3
+       bitMask      = bitMiddle | bitBoundary | bitAllocated | bitMarked
 )
 
 // All zero-sized allocations return a pointer to this byte.
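
As a reading aid, the addressing implied by these constants can be sketched outside the runtime. This is a hypothetical helper assuming a 64-bit platform (ptrSize = 8); the heap bitmap grows downward from arena_start, with gcBits (4) bits per heap word, two of which encode the pointer/scalar type:

    package main

    import "fmt"

    // bitmapPos returns the address of the bitmap word describing the heap
    // word at p, and the bit offset of its 4-bit entry within that word.
    func bitmapPos(arenaStart, p uintptr) (bitmapWord, shift uintptr) {
            const (
                    ptrSize            = 8 // 64-bit assumption
                    gcBits             = 4
                    wordsPerBitmapWord = ptrSize * 8 / gcBits // 16 heap words per bitmap word
            )
            off := (p - arenaStart) / ptrSize // index of the heap word
            bitmapWord = arenaStart - off/wordsPerBitmapWord*ptrSize - ptrSize
            shift = (off % wordsPerBitmapWord) * gcBits
            return
    }

    func main() {
            const arenaStart = 0xc000000000
            w, s := bitmapPos(arenaStart, arenaStart+17*8) // the 18th heap word
            fmt.Printf("bitmap word %#x, shift %d\n", w, s)
    }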
@@ -168,14 +186,112 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
                size = uintptr(s.elemsize)
        }
 
-       // TODO: write markallocated in Go
-       mp.ptrarg[0] = x
-       mp.scalararg[0] = uint(size)
-       mp.scalararg[1] = uint(size0)
-       mp.ptrarg[1] = unsafe.Pointer(typ)
-       mp.scalararg[2] = uint(flags & flagNoScan)
-       onM(&markallocated_m)
+       // From here till the marked label, the code marks the object as allocated
+       // and stores type info in the GC bitmap.
+       arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
+       off := (uintptr(x) - arena_start) / ptrSize
+       xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+       shift := (off % wordsPerBitmapWord) * gcBits
+       if debugMalloc && (((*xbits)>>shift)&bitMask) != bitBoundary {
+               gothrow("bad bits in markallocated")
+       }
 
+       var ti, te uintptr
+       var ptrmask *uint8
+       if flags&flagNoScan != 0 {
+               // bitsDead in the first quadruple means don't scan.
+               if size == ptrSize {
+                       *xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated + (bitsDead << 2)) << shift)
+               } else {
+                       xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+                       *xbitsb = bitAllocated + (bitsDead << 2)
+               }
+               goto marked
+       }
+       if size == ptrSize {
+               // It's one word and it has pointers, so it must be a pointer.
+               *xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated | (bitsPointer << 2)) << shift)
+               goto marked
+       }
+       if typ != nil && (uintptr(typ.gc[0])|uintptr(typ.gc[1])) != 0 && uintptr(typ.size) > ptrSize {
+               if typ.kind&kindGCProg != 0 {
+                       nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
+                       masksize := nptr
+                       if masksize%2 != 0 {
+                               masksize *= 2 // repeated twice
+                       }
+                       masksize = masksize * pointersPerByte / 8 // 4 bits per word
+                       masksize++                                // unroll flag in the beginning
+                       if masksize > maxGCMask && typ.gc[1] != 0 {
+                               // If the mask is too large, unroll the program directly
+                               // into the GC bitmap. It's 7 times slower than copying
+                               // from the pre-unrolled mask, but saves mask memory equal
+                               // to 1/16 of the type's size.
+                               mp.ptrarg[0] = x
+                               mp.ptrarg[1] = unsafe.Pointer(typ)
+                               mp.scalararg[0] = uint(size)
+                               mp.scalararg[1] = uint(size0)
+                               onM(&unrollgcproginplace_m)
+                               goto marked
+                       }
+                       ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
+                       // Check whether the program is already unrolled.
+                       if uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 {
+                               mp.ptrarg[0] = unsafe.Pointer(typ)
+                               onM(&unrollgcprog_m)
+                       }
+                       ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
+               } else {
+                       ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask
+               }
+               if size == 2*ptrSize {
+                       xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+                       *xbitsb = *ptrmask | bitAllocated
+                       goto marked
+               }
+               te = uintptr(typ.size) / ptrSize
+               // If the type occupies an odd number of words, its mask is repeated twice.
+               if te%2 == 0 {
+                       te /= 2
+               }
+       }
+       if size == 2*ptrSize {
+               xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+               *xbitsb = (bitsPointer << 2) | (bitsPointer << 6) | bitAllocated
+               goto marked
+       }
+       // Copy pointer bitmask into the bitmap.
+       for i := uintptr(0); i < size0; i += 2 * ptrSize {
+               v := uint8((bitsPointer << 2) | (bitsPointer << 6))
+               if ptrmask != nil {
+                       v = *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
+                       ti++
+                       if ti == te {
+                               ti = 0
+                       }
+               }
+               if i == 0 {
+                       v |= bitAllocated
+               }
+               if i+ptrSize == size0 {
+                       v &= ^uint8(bitPtrMask << 4)
+               }
+
+               off := (uintptr(x) + i - arena_start) / ptrSize
+               xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+               shift := (off % wordsPerBitmapWord) * gcBits
+               xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+               *xbitsb = v
+       }
+       if size0%(2*ptrSize) == 0 && size0 < size {
+               // Mark the word after the last object's word as bitsDead.
+               off := (uintptr(x) + size0 - arena_start) / ptrSize
+               xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+               shift := (off % wordsPerBitmapWord) * gcBits
+               xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+               *xbitsb = bitsDead << 2
+       }
+marked:
        mp.mallocing = 0
 
        if raceenabled {
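
The mp.ptrarg/mp.scalararg stores followed by onM(...) above are this commit's convention for calling from Go runtime code into C: arguments are stashed on the current M, onM runs the C function on the scheduler stack, and the C side reads and nils the slots. A toy model of that convention (hypothetical types; the real M, onM, and the _m functions are runtime internals):

    package main

    import "fmt"

    type m struct {
            ptrarg    [2]interface{} // models mp.ptrarg
            scalararg [3]uint        // models mp.scalararg
    }

    // onM models switching to the M's scheduler stack and running fn there.
    func onM(mp *m, fn func(*m)) { fn(mp) }

    func unrollgcprog_m(mp *m) {
            typ := mp.ptrarg[0]
            mp.ptrarg[0] = nil // clear the slot so it doesn't keep the object alive
            fmt.Println("unrolling gc program for", typ)
    }

    func main() {
            mp := &m{}
            mp.ptrarg[0] = "someType"
            onM(mp, unrollgcprog_m)
    }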
index 43feef79ed4aa05d86e58b96bd17c809552742f6..4b16c5553632f695e4a3236ba5b30d41acaa6e88 100644 (file)
@@ -517,7 +517,6 @@ void*       runtime·persistentalloc(uintptr size, uintptr align, uint64 *stat);
 int32  runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s);
 void   runtime·gc(int32 force);
 uintptr        runtime·sweepone(void);
-void   runtime·markallocated(void *v, uintptr size, uintptr size0, Type* typ, bool scan);
 void   runtime·markspan(void *v, uintptr size, uintptr n, bool leftover);
 void   runtime·unmarkspan(void *v, uintptr size);
 void   runtime·purgecachedstats(MCache*);
index 4637d68bce7bba36d45295012f9e7893058e4200..8998a871aea9c1b8c393f507f71dad869c9c26cf 100644 (file)
@@ -1741,7 +1741,7 @@ runtime·wakefing(void)
        return res;
 }
 
-// Recursively GC program in prog.
+// Recursively unrolls GC program in prog.
 // mask is where to store the result.
 // ppos is a pointer to position in mask, in bits.
 // sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise).
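
"Unrolling" here means expanding a compact GC program into a flat per-word mask. The runtime's real opcodes are defined in the GC headers and are not reproduced here; purely as an illustration of the idea, a toy unroller for a hypothetical run-length encoding:

    package main

    import "fmt"

    // unroll expands a toy "GC program" of (count, bits) pairs, terminated by
    // count == 0, into one mask byte per heap word. The encoding is invented
    // for illustration only.
    func unroll(prog []byte) []byte {
            var mask []byte
            for i := 0; i+1 < len(prog); i += 2 {
                    count, bits := int(prog[i]), prog[i+1]
                    if count == 0 {
                            break
                    }
                    for j := 0; j < count; j++ {
                            mask = append(mask, bits)
                    }
            }
            return mask
    }

    func main() {
            fmt.Println(unroll([]byte{3, 2, 2, 0, 0, 0})) // [2 2 2 0 0]
    }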
@@ -1833,11 +1833,20 @@ unrollglobgcprog(byte *prog, uintptr size)
        return mask;
 }
 
-static void
-unrollgcproginplace(void *v, uintptr size, uintptr size0, Type *typ)
+void
+runtime·unrollgcproginplace_m(void)
 {
-       uintptr *b, off, shift, pos;
+       uintptr size, size0, *b, off, shift, pos;
        byte *arena_start, *prog;
+       Type *typ;
+       void *v;
+
+       v = g->m->ptrarg[0];
+       typ = g->m->ptrarg[1];
+       size = g->m->scalararg[0];
+       size0 = g->m->scalararg[1];
+       g->m->ptrarg[0] = nil;
+       g->m->ptrarg[1] = nil;
 
        pos = 0;
        prog = (byte*)typ->gc[1];
@@ -1859,14 +1868,18 @@ unrollgcproginplace(void *v, uintptr size, uintptr size0, Type *typ)
 }
 
 // Unrolls GC program in typ->gc[1] into typ->gc[0]
-static void
-unrollgcprog(Type *typ)
+void
+runtime·unrollgcprog_m(void)
 {
        static Lock lock;
+       Type *typ;
        byte *mask, *prog;
        uintptr pos;
        uint32 x;
 
+       typ = g->m->ptrarg[0];
+       g->m->ptrarg[0] = nil;
+
        runtime·lock(&lock);
        mask = (byte*)typ->gc[0];
        if(mask[0] == 0) {
@@ -1887,110 +1900,6 @@ unrollgcprog(Type *typ)
        runtime·unlock(&lock);
 }
 
-void
-runtime·markallocated(void *v, uintptr size, uintptr size0, Type *typ, bool scan)
-{
-       uintptr *b, off, shift, i, ti, te, nptr, masksize;
-       byte *arena_start, x;
-       bool *ptrmask;
-
-       arena_start = runtime·mheap.arena_start;
-       off = (uintptr*)v - (uintptr*)arena_start;
-       b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-       shift = (off % wordsPerBitmapWord) * gcBits;
-       if(Debug && (((*b)>>shift)&bitMask) != bitBoundary) {
-               runtime·printf("runtime: bad bits in markallocated (%p) b=%p[%p]\n", v, b, *b);
-               runtime·throw("bad bits in markallocated");
-       }
-
-       if(!scan) {
-               // BitsDead in the first quadruple means don't scan.
-               if(size == PtrSize)
-                       *b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated+(BitsDead<<2))<<shift);
-               else
-                       ((byte*)b)[shift/8] = bitAllocated+(BitsDead<<2);
-               return;
-       }
-       if(size == PtrSize) {
-               // It's one word and it has pointers, it must be a pointer.
-               *b = (*b & ~((bitBoundary|bitPtrMask)<<shift)) | ((bitAllocated | (BitsPointer<<2))<<shift);
-               return;
-       }
-       ti = te = 0;
-       ptrmask = nil;
-       if(typ != nil && (typ->gc[0]|typ->gc[1]) != 0 && typ->size > PtrSize) {
-               if(typ->kind&KindGCProg) {
-                       nptr = ROUND(typ->size, PtrSize)/PtrSize;
-                       masksize = nptr;
-                       if(masksize%2)
-                               masksize *= 2;  // repeated twice
-                       masksize = masksize*PointersPerByte/8;  // 4 bits per word
-                       masksize++;     // unroll flag in the beginning
-                       if(masksize > MaxGCMask && typ->gc[1] != 0) {
-                               // If the mask is too large, unroll the program directly
-                               // into the GC bitmap. It's 7 times slower than copying
-                               // from the pre-unrolled mask, but saves 1/16 of type size
-                               // memory for the mask.
-                               unrollgcproginplace(v, size, size0, typ);
-                               return;
-                       }
-                       ptrmask = (byte*)typ->gc[0];
-                       // check whether the program is already unrolled
-                       if((runtime·atomicload((uint32*)ptrmask)&0xff) == 0)
-                               unrollgcprog(typ);
-                       ptrmask++;  // skip the unroll flag byte
-               } else
-                       ptrmask = (byte*)&typ->gc[0];  // embed mask
-               if(size == 2*PtrSize) {
-                       ((byte*)b)[shift/8] = ptrmask[0] | bitAllocated;
-                       return;
-               }
-               te = typ->size/PtrSize;
-               // if the type occupies odd number of words, its mask is repeated twice
-               if((te%2) == 0)
-                       te /= 2;
-       }
-       if(size == 2*PtrSize) {
-               ((byte*)b)[shift/8] = (BitsPointer<<2) | (BitsPointer<<6) | bitAllocated;
-               return;
-       }
-       // Copy pointer bitmask into the bitmap.
-       for(i=0; i<size0; i+=2*PtrSize) {
-               x = (BitsPointer<<2) | (BitsPointer<<6);
-               if(ptrmask != nil) {
-                       x = ptrmask[ti++];
-                       if(ti == te)
-                               ti = 0;
-               }
-               off = (uintptr*)((byte*)v + i) - (uintptr*)arena_start;
-               b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-               shift = (off % wordsPerBitmapWord) * gcBits;
-               if(i == 0)
-                       x |= bitAllocated;
-               if(i+PtrSize == size0)
-                       x &= ~(bitPtrMask<<4);
-               ((byte*)b)[shift/8] = x;
-       }
-       if(size0 == i && size0 < size) {
-               // mark the word after last object's word as BitsDead
-               off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
-               b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-               shift = (off % wordsPerBitmapWord) * gcBits;
-               ((byte*)b)[shift/8] = 0;
-       }
-}
-
-void
-runtime·markallocated_m(void)
-{
-       M *mp;
-
-       mp = g->m;
-       runtime·markallocated(mp->ptrarg[0], mp->scalararg[0], mp->scalararg[1], mp->ptrarg[1], mp->scalararg[2] == 0);
-       mp->ptrarg[0] = nil;
-       mp->ptrarg[1] = nil;
-}
-
 // mark the span of memory at v as having n blocks of the given size.
 // if leftover is true, there is left over space at the end of the span.
 void
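
runtime·unrollgcprog_m above follows an unroll-once protocol: callers first do an atomic load of the mask's flag byte, and only if it is still zero do they take the lock, re-check, unroll, and publish the flag. The same shape in portable Go, as a sketch with hypothetical names:

    package main

    import (
            "fmt"
            "sync"
            "sync/atomic"
    )

    // gcMask models a mask whose first byte doubles as an "unrolled" flag.
    type gcMask struct {
            mu       sync.Mutex
            unrolled uint32
            mask     []byte
    }

    func (m *gcMask) ensureUnrolled(unroll func() []byte) []byte {
            if atomic.LoadUint32(&m.unrolled) == 0 { // fast-path check
                    m.mu.Lock()
                    if m.unrolled == 0 { // re-check under the lock
                            m.mask = unroll()
                            atomic.StoreUint32(&m.unrolled, 1) // publish
                    }
                    m.mu.Unlock()
            }
            return m.mask
    }

    func main() {
            var m gcMask
            fmt.Println(m.ensureUnrolled(func() []byte { return []byte{1, 2, 3} }))
    }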
index 8a2fc8a97e87a3acf1773c5abce90deb005ac567..fee18f0470b40441a14bcc47123d996431069f87 100644 (file)
@@ -65,7 +65,9 @@ var (
        mprofMalloc_m,
        gc_m,
        setFinalizer_m,
-       markallocated_m mFunction
+       markallocated_m,
+       unrollgcprog_m,
+       unrollgcproginplace_m mFunction
 )
 
 // memclr clears n bytes starting at ptr.
@@ -87,7 +89,12 @@ const (
        concurrentSweep  = true
 )
 
-// in asm_*.s
+// Atomic operations to read/write a pointer.
+// in stubs.goc
+func goatomicloadp(p unsafe.Pointer) unsafe.Pointer     // return *p
+func goatomicstorep(p unsafe.Pointer, v unsafe.Pointer) // *p = v
+
+// in stubs.goc
 // if *p == x { *p = y; return true } else { return false }, atomically
 //go:noescape
 func gocas(p *uint32, x uint32, y uint32) bool
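
Outside the runtime, the contract documented for gocas is exactly what sync/atomic.CompareAndSwapUint32 provides; a minimal demonstration:

    package main

    import (
            "fmt"
            "sync/atomic"
    )

    func main() {
            var v uint32 = 1
            fmt.Println(atomic.CompareAndSwapUint32(&v, 1, 2), v) // true 2
            fmt.Println(atomic.CompareAndSwapUint32(&v, 1, 3), v) // false 2
    }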
index c64e73de055201b1f1c30aa7d26ddfe767e9f5b1..42a4bf14346b95c49ae0c8a5f7cee1e9f8f3cbb8 100644 (file)
@@ -48,6 +48,16 @@ func gonanotime() (r int64) {
        r = runtime·nanotime();
 }
 
+#pragma textflag NOSPLIT
+func goatomicloadp(p **byte) (v *byte) {
+       v = runtime·atomicloadp(p);
+}
+
+#pragma textflag NOSPLIT
+func goatomicstorep(p **byte, v *byte) {
+       runtime·atomicstorep(p, v);
+}
+
 #pragma textflag NOSPLIT
 func runtime·gocas(p *uint32, x uint32, y uint32) (ret bool) {
        ret = runtime·cas(p, x, y);
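
For reference, the exported equivalents of goatomicloadp/goatomicstorep are atomic.LoadPointer and atomic.StorePointer; the runtime versions are marked NOSPLIT so they can run during allocation without a stack-split check. A sketch (not runtime code):

    package main

    import (
            "fmt"
            "sync/atomic"
            "unsafe"
    )

    func main() {
            var slot unsafe.Pointer
            x := 42
            atomic.StorePointer(&slot, unsafe.Pointer(&x)) // *p = v
            p := (*int)(atomic.LoadPointer(&slot))         // return *p
            fmt.Println(*p)                                // 42
    }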