runtime: Tweak __go_can_recover for SPARC.
diff --git a/libgo/runtime/mgc0.c b/libgo/runtime/mgc0.c
index 27fc3cd..73c399d 100644
--- a/libgo/runtime/mgc0.c
+++ b/libgo/runtime/mgc0.c
@@ -5,13 +5,24 @@
 // Garbage collector.
 
 #include "runtime.h"
+#include "arch.h"
 #include "malloc.h"
 
+#ifdef USING_SPLIT_STACK
+
+extern void * __splitstack_find (void *, void *, size_t *, void **, void **,
+                                void **);
+
+extern void * __splitstack_find_context (void *context[10], size_t *, void **,
+                                        void **, void **);
+
+#endif
+
 enum {
        Debug = 0,
-       UseCas = 1,
        PtrSize = sizeof(void*),
-       
+       DebugMark = 0,  // run second pass to check mark
+
        // Four bits per word (see #defines below).
        wordsPerBitmapWord = sizeof(void*)*8/4,
        bitShift = sizeof(void*)*8/4,
@@ -50,28 +61,68 @@ enum {
 
 #define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial)
 
-static uint64 nlookup;
-static uint64 nsizelookup;
-static uint64 naddrlookup;
+// TODO: Make these per-M.
+static uint64 nhandoff;
+
 static int32 gctrace;
 
 typedef struct Workbuf Workbuf;
 struct Workbuf
 {
        Workbuf *next;
-       uintptr nw;
-       byte *w[2048-2];
+       uintptr nobj;
+       byte *obj[512-2];
 };
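
The buffer shrinks from 2046 to 510 slots; with 8-byte pointers that makes
sizeof(Workbuf) exactly one 4K page (8 for next, 8 for nobj, 510*8 for obj),
which divides evenly into the 1MB chunks carved up in getempty below. A quick
check (a sketch, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    typedef struct Workbuf Workbuf;
    struct Workbuf {
        Workbuf *next;
        uintptr_t nobj;
        unsigned char *obj[512-2];
    };

    int main(void) {
        /* Prints 4096 on an LP64 target. */
        printf("sizeof(Workbuf) = %lu\n", (unsigned long)sizeof(Workbuf));
        return 0;
    }
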
 
-static bool finstarted;
-static pthread_mutex_t finqlock = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t finqcond = PTHREAD_COND_INITIALIZER;
-static Finalizer *finq;
+typedef struct Finalizer Finalizer;
+struct Finalizer
+{
+       void (*fn)(void*);
+       void *arg;
+       const struct __go_func_type *ft;
+};
+
+typedef struct FinBlock FinBlock;
+struct FinBlock
+{
+       FinBlock *alllink;
+       FinBlock *next;
+       int32 cnt;
+       int32 cap;
+       Finalizer fin[1];
+};
+
+static G *fing;
+static FinBlock *finq; // list of finalizers that are to be executed
+static FinBlock *finc; // cache of free blocks
+static FinBlock *allfin; // list of all blocks
+static Lock finlock;
 static int32 fingwait;
 
 static void runfinq(void*);
 static Workbuf* getempty(Workbuf*);
 static Workbuf* getfull(Workbuf*);
+static void    putempty(Workbuf*);
+static Workbuf* handoff(Workbuf*);
+
+static struct {
+       Lock fmu;
+       Workbuf *full;
+       Lock emu;
+       Workbuf *empty;
+       uint32  nproc;
+       volatile uint32 nwait;
+       volatile uint32 ndone;
+       Note    alldone;
+       Lock    markgate;
+       Lock    sweepgate;
+       MSpan   *spans;
+
+       Lock;           // embedded Lock; guards chunk and nchunk below
+       byte    *chunk;
+       uintptr nchunk;
+} work;
 
 // scanblock scans a block of n bytes starting at pointer b for references
 // to other objects, scanning any it finds recursively until there are no
@@ -82,21 +133,35 @@ static Workbuf* getfull(Workbuf*);
 static void
 scanblock(byte *b, int64 n)
 {
-       byte *obj, *arena_start, *p;
+       byte *obj, *arena_start, *arena_used, *p;
        void **vp;
-       uintptr size, *bitp, bits, shift, i, j, x, xbits, off;
+       uintptr size, *bitp, bits, shift, i, j, x, xbits, off, nobj, nproc;
        MSpan *s;
        PageID k;
-       void **bw, **w, **ew;
+       void **wp;
        Workbuf *wbuf;
+       bool keepworking;
+
+       if((int64)(uintptr)n != n || n < 0) {
+               // runtime_printf("scanblock %p %lld\n", b, (long long)n);
+               runtime_throw("scanblock");
+       }
 
        // Memory arena parameters.
        arena_start = runtime_mheap.arena_start;
-       
+       arena_used = runtime_mheap.arena_used;
+       nproc = work.nproc;
+
        wbuf = nil;  // current work buffer
-       ew = nil;  // end of work buffer
-       bw = nil;  // beginning of work buffer
-       w = nil;  // current pointer into work buffer
+       wp = nil;  // storage for next queued pointer (write pointer)
+       nobj = 0;  // number of queued objects
+
+       // Scanblock helpers pass b==nil.
+       // The main proc needs to return to make more
+       // calls to scanblock.  But if work.nproc==1 then
+       // might as well process blocks as soon as we
+       // have them.
+       keepworking = b == nil || work.nproc == 1;
 
        // Align b to a word boundary.
        off = (uintptr)b & (PtrSize-1);
@@ -112,17 +177,17 @@ scanblock(byte *b, int64 n)
                        runtime_printf("scanblock %p %lld\n", b, (long long) n);
 
                vp = (void**)b;
-               n /= PtrSize;
+               n >>= (2+PtrSize/8);  /* n /= PtrSize (4 or 8) */
                for(i=0; i<(uintptr)n; i++) {
                        obj = (byte*)vp[i];
-                       
+
                        // Words outside the arena cannot be pointers.
-                       if((byte*)obj < arena_start || (byte*)obj >= runtime_mheap.arena_used)
+                       if((byte*)obj < arena_start || (byte*)obj >= arena_used)
                                continue;
-                       
+
                        // obj may be a pointer to a live object.
                        // Try to find the beginning of the object.
-                       
+
                        // Round down to word boundary.
                        obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
 
@@ -150,8 +215,6 @@ scanblock(byte *b, int64 n)
 
                        // Otherwise consult span table to find beginning.
                        // (Manually inlined copy of MHeap_LookupMaybe.)
-                       nlookup++;
-                       naddrlookup++;
                        k = (uintptr)obj>>PageShift;
                        x = k;
                        if(sizeof(void*) == 8)
@@ -180,83 +243,67 @@ scanblock(byte *b, int64 n)
                found:
                        // Now we have bits, bitp, and shift correct for
                        // obj pointing at the base of the object.
-                       // If not allocated or already marked, done.
-                       if((bits & bitAllocated) == 0 || (bits & bitMarked) != 0)
+                       // Only care about allocated and not marked.
+                       if((bits & (bitAllocated|bitMarked)) != bitAllocated)
                                continue;
-                       *bitp |= bitMarked<<shift;
+                       if(nproc == 1)
+                               *bitp |= bitMarked<<shift;
+                       else {
+                               for(;;) {
+                                       x = *bitp;
+                                       if(x & (bitMarked<<shift))
+                                               goto continue_obj;
+                                       if(runtime_casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
+                                               break;
+                               }
+                       }
 
                        // If object has no pointers, don't need to scan further.
                        if((bits & bitNoPointers) != 0)
                                continue;
 
+                       // If another proc wants a pointer, give it some.
+                       if(nobj > 4 && work.nwait > 0 && work.full == nil) {
+                               wbuf->nobj = nobj;
+                               wbuf = handoff(wbuf);
+                               nobj = wbuf->nobj;
+                               wp = (void**)(wbuf->obj + nobj);
+                       }
+
                        // If buffer is full, get a new one.
-                       if(w >= ew) {
+                       if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
+                               if(wbuf != nil)
+                                       wbuf->nobj = nobj;
                                wbuf = getempty(wbuf);
-                               bw = (void**)wbuf->w;
-                               w = bw;
-                               ew = bw + nelem(wbuf->w);
+                               wp = (void**)(wbuf->obj);
+                               nobj = 0;
                        }
-                       *w++ = obj;
+                       *wp++ = obj;
+                       nobj++;
+               continue_obj:;
                }
-               
+
                // Done scanning [b, b+n).  Prepare for the next iteration of
                // the loop by setting b and n to the parameters for the next block.
 
-               // Fetch b from the work buffers.
-               if(w <= bw) {
+               // Fetch b from the work buffer.
+               if(nobj == 0) {
+                       if(!keepworking) {
+                               putempty(wbuf);
+                               return;
+                       }
                        // Emptied our buffer: refill.
                        wbuf = getfull(wbuf);
                        if(wbuf == nil)
-                               break;
-                       bw = (void**)wbuf->w;
-                       ew = (void**)(wbuf->w + nelem(wbuf->w));
-                       w = bw+wbuf->nw;
+                               return;
+                       nobj = wbuf->nobj;
+                       wp = (void**)(wbuf->obj + wbuf->nobj);
                }
-               b = *--w;
-       
-               // Figure out n = size of b.  Start by loading bits for b.
-               off = (uintptr*)b - (uintptr*)arena_start;
-               bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-               shift = off % wordsPerBitmapWord;
-               xbits = *bitp;
-               bits = xbits >> shift;
-               
-               // Might be small; look for nearby block boundary.
-               // A block boundary is marked by either bitBlockBoundary
-               // or bitAllocated being set (see notes near their definition).
-               enum {
-                       boundary = bitBlockBoundary|bitAllocated
-               };
-               // Look for a block boundary both after and before b
-               // in the same bitmap word.
-               //
-               // A block boundary j words after b is indicated by
-               //      bits>>j & boundary
-               // assuming shift+j < bitShift.  (If shift+j >= bitShift then
-               // we'll be bleeding other bit types like bitMarked into our test.)
-               // Instead of inserting the conditional shift+j < bitShift into the loop,
-               // we can let j range from 1 to bitShift as long as we first
-               // apply a mask to keep only the bits corresponding
-               // to shift+j < bitShift aka j < bitShift-shift.
-               bits &= (boundary<<(bitShift-shift)) - boundary;
-               
-               // A block boundary j words before b is indicated by
-               //      xbits>>(shift-j) & boundary
-               // (assuming shift >= j).  There is no cleverness here
-               // avoid the test, because when j gets too large the shift
-               // turns negative, which is undefined in C.             
-
-               for(j=1; j<bitShift; j++) {
-                       if(((bits>>j)&boundary) != 0 || (shift>=j && ((xbits>>(shift-j))&boundary) != 0)) {
-                               n = j*PtrSize;
-                               goto scan;
-                       }
-               }
-               
-               // Fall back to asking span about size class.
+               b = *--wp;
+               nobj--;
+
+               // Ask span about size class.
                // (Manually inlined copy of MHeap_Lookup.)
-               nlookup++;
-               nsizelookup++;
                x = (uintptr)b>>PageShift;
                if(sizeof(void*) == 8)
                        x -= (uintptr)arena_start>>PageShift;
@@ -265,33 +312,126 @@ scanblock(byte *b, int64 n)
                        n = s->npages<<PageShift;
                else
                        n = runtime_class_to_size[s->sizeclass];
-       scan:;
        }
 }
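
The nproc>1 marking path above is a classic compare-and-swap retry loop:
reread the bitmap word, give up if another proc set the mark first, otherwise
try to publish the updated word. The same loop in isolation (a sketch;
__sync_bool_compare_and_swap stands in for runtime_casp):

    #include <stdint.h>
    #include <stdbool.h>

    /* Returns true if this caller set the bit,
       false if another thread already had. */
    static bool
    try_mark(uintptr_t *bitp, uintptr_t bit)
    {
        for(;;) {
            uintptr_t x = *bitp;
            if(x & bit)
                return false;   /* lost the race */
            if(__sync_bool_compare_and_swap(bitp, x, x|bit))
                return true;    /* mark published */
        }
    }
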
 
-static struct {
-       Workbuf *full;
-       Workbuf *empty;
-       byte    *chunk;
-       uintptr nchunk;
-} work;
+// debug_scanblock is the debug copy of scanblock.
+// it is simpler, slower, single-threaded, recursive,
+// and uses bitSpecial as the mark bit.
+static void
+debug_scanblock(byte *b, int64 n)
+{
+       byte *obj, *p;
+       void **vp;
+       uintptr size, *bitp, bits, shift, i, xbits, off;
+       MSpan *s;
+
+       if(!DebugMark)
+               runtime_throw("debug_scanblock without DebugMark");
+
+       if((int64)(uintptr)n != n || n < 0) {
+               //runtime_printf("debug_scanblock %p %D\n", b, n);
+               runtime_throw("debug_scanblock");
+       }
+
+       // Align b to a word boundary.
+       off = (uintptr)b & (PtrSize-1);
+       if(off != 0) {
+               b += PtrSize - off;
+               n -= PtrSize - off;
+       }
+
+       vp = (void**)b;
+       n /= PtrSize;
+       for(i=0; i<(uintptr)n; i++) {
+               obj = (byte*)vp[i];
+
+               // Words outside the arena cannot be pointers.
+               if((byte*)obj < runtime_mheap.arena_start || (byte*)obj >= runtime_mheap.arena_used)
+                       continue;
+
+               // Round down to word boundary.
+               obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
+
+               // Consult span table to find beginning.
+               s = runtime_MHeap_LookupMaybe(&runtime_mheap, obj);
+               if(s == nil)
+                       continue;
+
+               p = (byte*)((uintptr)s->start<<PageShift);
+               if(s->sizeclass == 0) {
+                       obj = p;
+                       size = (uintptr)s->npages<<PageShift;
+               } else {
+                       if((byte*)obj >= (byte*)s->limit)
+                               continue;
+                       size = runtime_class_to_size[s->sizeclass];
+                       int32 i = ((byte*)obj - p)/size;
+                       obj = p+i*size;
+               }
+
+               // Now that we know the object header, reload bits.
+               off = (uintptr*)obj - (uintptr*)runtime_mheap.arena_start;
+               bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
+               shift = off % wordsPerBitmapWord;
+               xbits = *bitp;
+               bits = xbits >> shift;
+
+               // Now we have bits, bitp, and shift correct for
+               // obj pointing at the base of the object.
+               // If not allocated or already marked, done.
+               if((bits & bitAllocated) == 0 || (bits & bitSpecial) != 0)  // NOTE: bitSpecial not bitMarked
+                       continue;
+               *bitp |= bitSpecial<<shift;
+               if(!(bits & bitMarked))
+                       runtime_printf("found unmarked block %p in %p\n", obj, vp+i);
+
+               // If object has no pointers, don't need to scan further.
+               if((bits & bitNoPointers) != 0)
+                       continue;
+
+               debug_scanblock(obj, size);
+       }
+}
 
 // Get an empty work buffer off the work.empty list,
 // allocating new buffers as needed.
 static Workbuf*
 getempty(Workbuf *b)
 {
-       if(b != nil) {
-               b->nw = nelem(b->w);
-               b->next = work.full;
-               work.full = b;
-       }
-       b = work.empty;
-       if(b != nil) {
-               work.empty = b->next;
-               return b;
+       if(work.nproc == 1) {
+               // Put b on full list.
+               if(b != nil) {
+                       b->next = work.full;
+                       work.full = b;
+               }
+               // Grab from empty list if possible.
+               b = work.empty;
+               if(b != nil) {
+                       work.empty = b->next;
+                       goto haveb;
+               }
+       } else {
+               // Put b on full list.
+               if(b != nil) {
+                       runtime_lock(&work.fmu);
+                       b->next = work.full;
+                       work.full = b;
+                       runtime_unlock(&work.fmu);
+               }
+               // Grab from empty list if possible.
+               runtime_lock(&work.emu);
+               b = work.empty;
+               if(b != nil)
+                       work.empty = b->next;
+               runtime_unlock(&work.emu);
+               if(b != nil)
+                       goto haveb;
        }
-       
+
+       // Need to allocate.
+       runtime_lock(&work);
        if(work.nchunk < sizeof *b) {
                work.nchunk = 1<<20;
                work.chunk = runtime_SysAlloc(work.nchunk);
@@ -299,26 +439,195 @@ getempty(Workbuf *b)
        b = (Workbuf*)work.chunk;
        work.chunk += sizeof *b;
        work.nchunk -= sizeof *b;
+       runtime_unlock(&work);
+
+haveb:
+       b->nobj = 0;
        return b;
 }
 
+static void
+putempty(Workbuf *b)
+{
+       if(b == nil)
+               return;
+
+       if(work.nproc == 1) {
+               b->next = work.empty;
+               work.empty = b;
+               return;
+       }
+
+       runtime_lock(&work.emu);
+       b->next = work.empty;
+       work.empty = b;
+       runtime_unlock(&work.emu);
+}
+
 // Get a full work buffer off the work.full list, or return nil.
 static Workbuf*
 getfull(Workbuf *b)
 {
-       if(b != nil) {
-               b->nw = 0;
-               b->next = work.empty;
-               work.empty = b;
+       int32 i;
+       Workbuf *b1;
+
+       if(work.nproc == 1) {
+               // Put b on empty list.
+               if(b != nil) {
+                       b->next = work.empty;
+                       work.empty = b;
+               }
+               // Grab from full list if possible.
+               // Since work.nproc==1, no one else is
+               // going to give us work.
+               b = work.full;
+               if(b != nil)
+                       work.full = b->next;
+               return b;
        }
-       b = work.full;
-       if(b != nil)
-               work.full = b->next;
-       return b;
+
+       putempty(b);
+
+       // Grab buffer from full list if possible.
+       for(;;) {
+               b1 = work.full;
+               if(b1 == nil)
+                       break;
+               runtime_lock(&work.fmu);
+               if(work.full != nil) {
+                       b1 = work.full;
+                       work.full = b1->next;
+                       runtime_unlock(&work.fmu);
+                       return b1;
+               }
+               runtime_unlock(&work.fmu);
+       }
+
+       runtime_xadd(&work.nwait, +1);
+       for(i=0;; i++) {
+               b1 = work.full;
+               if(b1 != nil) {
+                       runtime_lock(&work.fmu);
+                       if(work.full != nil) {
+                               runtime_xadd(&work.nwait, -1);
+                               b1 = work.full;
+                               work.full = b1->next;
+                               runtime_unlock(&work.fmu);
+                               return b1;
+                       }
+                       runtime_unlock(&work.fmu);
+                       continue;
+               }
+               if(work.nwait == work.nproc)
+                       return nil;
+               if(i < 10)
+                       runtime_procyield(20);
+               else if(i < 20)
+                       runtime_osyield();
+               else
+                       runtime_usleep(100);
+       }
+}
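
The wait loop escalates from busy-spinning through yielding to sleeping, so
idle helpers cost little while one proc drains a deep object graph. A POSIX
sketch of the same ladder (sched_yield and usleep stand in for
runtime_osyield and runtime_usleep; the spin body is only a compiler barrier):

    #include <sched.h>
    #include <unistd.h>

    static void
    backoff(int i)
    {
        int spin;

        if(i < 10) {
            for(spin = 0; spin < 20; spin++)   /* like runtime_procyield(20) */
                __asm__ __volatile__("" ::: "memory");
        } else if(i < 20)
            sched_yield();                     /* like runtime_osyield() */
        else
            usleep(100);                       /* like runtime_usleep(100) */
    }
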
+
+static Workbuf*
+handoff(Workbuf *b)
+{
+       int32 n;
+       Workbuf *b1;
+
+       // Make new buffer with half of b's pointers.
+       b1 = getempty(nil);
+       n = b->nobj/2;
+       b->nobj -= n;
+       b1->nobj = n;
+       runtime_memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
+       nhandoff += n;
+
+       // Put b on full list - let first half of b get stolen.
+       runtime_lock(&work.fmu);
+       b->next = work.full;
+       work.full = b;
+       runtime_unlock(&work.fmu);
+
+       return b1;
 }
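
handoff keeps the top half of the buffer for its caller and publishes the
buffer holding the bottom half on work.full for other procs to steal, so both
sides end up with a contiguous run of pointers. The split in isolation
(a sketch; Buf is a stand-in for Workbuf):

    #include <string.h>
    #include <stdint.h>

    typedef struct {
        uintptr_t nobj;
        void *obj[512-2];
    } Buf;

    /* Move the top half of src into dst, as handoff does;
       src keeps its bottom half, ready to be stolen. */
    static void
    split(Buf *src, Buf *dst)
    {
        uintptr_t n = src->nobj/2;

        src->nobj -= n;
        dst->nobj = n;
        memmove(dst->obj, src->obj + src->nobj, n*sizeof dst->obj[0]);
    }
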
 
 // Scanstack calls scanblock on each of gp's stack segments.
 static void
+scanstack(void (*scanblock)(byte*, int64), G *gp)
+{
+#ifdef USING_SPLIT_STACK
+       M *mp;
+       void* sp;
+       size_t spsize;
+       void* next_segment;
+       void* next_sp;
+       void* initial_sp;
+
+       if(gp == runtime_g()) {
+               // Scanning our own stack.
+               sp = __splitstack_find(nil, nil, &spsize, &next_segment,
+                                      &next_sp, &initial_sp);
+       } else if((mp = gp->m) != nil && mp->helpgc) {
+               // gchelper's stack is in active use and has no interesting pointers.
+               return;
+       } else {
+               // Scanning another goroutine's stack.
+               // The goroutine is usually asleep (the world is stopped).
+
+               // The exception is that if the goroutine is about to enter or might
+               // have just exited a system call, it may be executing code such
+               // as schedlock and may have needed to start a new stack segment.
+               // Use the stack segment and stack pointer at the time of
+               // the system call instead, since that won't change underfoot.
+               if(gp->gcstack != nil) {
+                       sp = gp->gcstack;
+                       spsize = gp->gcstack_size;
+                       next_segment = gp->gcnext_segment;
+                       next_sp = gp->gcnext_sp;
+                       initial_sp = gp->gcinitial_sp;
+               } else {
+                       sp = __splitstack_find_context(&gp->stack_context[0],
+                                                      &spsize, &next_segment,
+                                                      &next_sp, &initial_sp);
+               }
+       }
+       if(sp != nil) {
+               scanblock(sp, spsize);
+               while((sp = __splitstack_find(next_segment, next_sp,
+                                             &spsize, &next_segment,
+                                             &next_sp, &initial_sp)) != nil)
+                       scanblock(sp, spsize);
+       }
+#else
+       M *mp;
+       byte* bottom;
+       byte* top;
+
+       if(gp == runtime_g()) {
+               // Scanning our own stack.
+               bottom = (byte*)&gp;
+       } else if((mp = gp->m) != nil && mp->helpgc) {
+               // gchelper's stack is in active use and has no interesting pointers.
+               return;
+       } else {
+               // Scanning another goroutine's stack.
+               // The goroutine is usually asleep (the world is stopped).
+               bottom = (byte*)gp->gcnext_sp;
+               if(bottom == nil)
+                       return;
+       }
+       top = (byte*)gp->gcinitial_sp + gp->gcstack_size;
+       if(top > bottom)
+               scanblock(bottom, top - bottom);
+       else
+               scanblock(top, bottom - top);
+#endif
+}
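
The closing comparison in the non-split-stack branch exists because the stack
may grow up or down depending on the target; ordering the two ends first makes
the scan direction-agnostic. The same logic standalone (a sketch, with plain C
types in place of byte/int64):

    /* Scan the region between two stack ends, whichever way the stack grows. */
    static void
    scan_between(unsigned char *a, unsigned char *b,
                 void (*scan)(unsigned char*, long long))
    {
        if(a < b)
            scan(a, b - a);
        else
            scan(b, a - b);
    }
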
+
+// Markfin calls scanblock on the blocks that have finalizers:
+// the things pointed at cannot be freed until the finalizers have run.
+static void
 markfin(void *v)
 {
        uintptr size;
@@ -350,12 +659,25 @@ __go_register_gc_roots (struct root_list* r)
        roots = r;
 }
 
-// Mark 
 static void
-mark(void)
+debug_markfin(void *v)
+{
+       uintptr size;
+
+       if(!runtime_mlookup(v, (byte**)&v, &size, nil))
+               runtime_throw("debug_mark - finalizer inconsistency");
+       debug_scanblock(v, size);
+}
+
+// Mark
+static void
+mark(void (*scan)(byte*, int64))
 {
        struct root_list *pl;
+       G *gp;
+       FinBlock *fb;
 
+       // mark data+bss.
        for(pl = roots; pl != nil; pl = pl->next) {
                struct root* pr = &pl->roots[0];
                while(1) {
@@ -367,30 +689,106 @@ mark(void)
                }
        }
 
-       scanblock((byte*)&m0, sizeof m0);
-       scanblock((byte*)&finq, sizeof finq);
-       runtime_MProf_Mark(scanblock);
+       scan((byte*)&runtime_m0, sizeof runtime_m0);
+       scan((byte*)&runtime_g0, sizeof runtime_g0);
+       scan((byte*)&runtime_allg, sizeof runtime_allg);
+       scan((byte*)&runtime_allm, sizeof runtime_allm);
+       runtime_MProf_Mark(scan);
+       runtime_time_scan(scan);
 
        // mark stacks
-       __go_scanstacks(scanblock);
+       for(gp=runtime_allg; gp!=nil; gp=gp->alllink) {
+               switch(gp->status){
+               default:
+                       runtime_printf("unexpected G.status %d\n", gp->status);
+                       runtime_throw("mark - bad status");
+               case Gdead:
+                       break;
+               case Grunning:
+                       if(gp != runtime_g())
+                               runtime_throw("mark - world not stopped");
+                       scanstack(scan, gp);
+                       break;
+               case Grunnable:
+               case Gsyscall:
+               case Gwaiting:
+                       scanstack(scan, gp);
+                       break;
+               }
+       }
 
        // mark things pointed at by objects with finalizers
-       runtime_walkfintab(markfin, scanblock);
+       if(scan == debug_scanblock)
+               runtime_walkfintab(debug_markfin, scan);
+       else
+               runtime_walkfintab(markfin, scan);
+
+       for(fb=allfin; fb; fb=fb->alllink)
+               scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
+
+       // in multiproc mode, join in the queued work.
+       scan(nil, 0);
 }
 
-// Sweep frees or calls finalizers for blocks not marked in the mark phase.
+static bool
+handlespecial(byte *p, uintptr size)
+{
+       void (*fn)(void*);
+       const struct __go_func_type *ft;
+       FinBlock *block;
+       Finalizer *f;
+       
+       if(!runtime_getfinalizer(p, true, &fn, &ft)) {
+               runtime_setblockspecial(p, false);
+               runtime_MProf_Free(p, size);
+               return false;
+       }
+
+       runtime_lock(&finlock);
+       if(finq == nil || finq->cnt == finq->cap) {
+               if(finc == nil) {
+                       finc = runtime_SysAlloc(PageSize);
+                       finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1;
+                       finc->alllink = allfin;
+                       allfin = finc;
+               }
+               block = finc;
+               finc = block->next;
+               block->next = finq;
+               finq = block;
+       }
+       f = &finq->fin[finq->cnt];
+       finq->cnt++;
+       f->fn = fn;
+       f->ft = ft;
+       f->arg = p;
+       runtime_unlock(&finlock); 
+       return true;
+}
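
Free finalizer blocks are carved out of whole pages: cap counts how many
Finalizer slots fit after the header, and the "+ 1" reclaims the fin[1] slot
already embedded in the struct. Checking the arithmetic standalone (a sketch;
the field types approximate the runtime's):

    #include <stdio.h>

    enum { PageSize = 4096 };   /* assumed page size, as in malloc.h */

    typedef struct {
        void (*fn)(void*);
        void *arg;
        const void *ft;
    } Finalizer;

    typedef struct FinBlock FinBlock;
    struct FinBlock {
        FinBlock *alllink;
        FinBlock *next;
        int cnt;
        int cap;
        Finalizer fin[1];
    };

    int main(void) {
        int cap = (PageSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1;
        printf("cap = %d\n", cap);  /* 169 with 8-byte pointers */
        return 0;
    }
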
+
+// Sweep frees or collects finalizers for blocks not marked in the mark phase.
 // It clears the mark bits in preparation for the next GC round.
 static void
 sweep(void)
 {
+       M *m;
        MSpan *s;
        int32 cl, n, npages;
        uintptr size;
        byte *p;
        MCache *c;
-       Finalizer *f;
+       byte *arena_start;
+
+       m = runtime_m();
+       arena_start = runtime_mheap.arena_start;
+
+       for(;;) {
+               s = work.spans;
+               if(s == nil)
+                       break;
+               if(!runtime_casp(&work.spans, s, s->allnext))
+                       continue;
 
-       for(s = runtime_mheap.allspans; s != nil; s = s->allnext) {
                if(s->state != MSpanInUse)
                        continue;
 
@@ -405,13 +803,15 @@ sweep(void)
                        npages = runtime_class_to_allocnpages[cl];
                        n = (npages << PageShift) / size;
                }
-       
-               // sweep through n objects of given size starting at p.
+
+               // Sweep through n objects of given size starting at p.
+               // This thread owns the span now, so it can manipulate
+               // the block bitmap without atomic operations.
                for(; n > 0; n--, p += size) {
                        uintptr off, *bitp, shift, bits;
 
-                       off = (uintptr*)p - (uintptr*)runtime_mheap.arena_start;
-                       bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
+                       off = (uintptr*)p - (uintptr*)arena_start;
+                       bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
                        shift = off % wordsPerBitmapWord;
                        bits = *bitp>>shift;
 
@@ -419,25 +819,27 @@ sweep(void)
                                continue;
 
                        if((bits & bitMarked) != 0) {
+                               if(DebugMark) {
+                                       if(!(bits & bitSpecial))
+                                               runtime_printf("found spurious mark on %p\n", p);
+                                       *bitp &= ~(bitSpecial<<shift);
+                               }
                                *bitp &= ~(bitMarked<<shift);
                                continue;
                        }
 
-                       if((bits & bitSpecial) != 0) {
-                               // Special means it has a finalizer or is being profiled.
-                               f = runtime_getfinalizer(p, 1);
-                               if(f != nil) {
-                                       f->arg = p;
-                                       f->next = finq;
-                                       finq = f;
+                       // Special means it has a finalizer or is being profiled.
+                       // In DebugMark mode, the bit has been coopted so
+                       // we have to assume all blocks are special.
+                       if(DebugMark || (bits & bitSpecial) != 0) {
+                               if(handlespecial(p, size))
                                        continue;
-                               }
-                               runtime_MProf_Free(p, size);
                        }
 
                        // Mark freed; restore block boundary bit.
                        *bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
 
+                       c = m->mcache;
                        if(s->sizeclass == 0) {
                                // Free large span.
                                runtime_unmarkspan(p, 1<<PageShift);
@@ -445,19 +847,38 @@ sweep(void)
                                runtime_MHeap_Free(&runtime_mheap, s, 1);
                        } else {
                                // Free small object.
-                               c = m->mcache;
                                if(size > sizeof(uintptr))
                                        ((uintptr*)p)[1] = 1;   // mark as "needs to be zeroed"
-                               mstats.by_size[s->sizeclass].nfree++;
+                               c->local_by_size[s->sizeclass].nfree++;
                                runtime_MCache_Free(c, p, s->sizeclass, size);
                        }
-                       mstats.alloc -= size;
-                       mstats.nfree++;
+                       c->local_alloc -= size;
+                       c->local_nfree++;
                }
        }
 }
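
Sweepers split the span list among themselves without a lock: each one pops
the head of work.spans with compare-and-swap, so every span is claimed exactly
once. The idiom in isolation (a sketch; it is safe here only because nothing
pushes onto the list while sweeping, which sidesteps the usual ABA hazard):

    #include <stddef.h>

    typedef struct Span Span;
    struct Span { Span *allnext; };

    static Span *spans;   /* shared list head, filled in before sweeping starts */

    /* Pop one span for this thread to sweep, or NULL when the list is empty.
       __sync_bool_compare_and_swap stands in for runtime_casp. */
    static Span*
    claim_span(void)
    {
        for(;;) {
            Span *s = spans;
            if(s == NULL)
                return NULL;
            if(__sync_bool_compare_and_swap(&spans, s, s->allnext))
                return s;
        }
    }
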
 
-static pthread_mutex_t gcsema = PTHREAD_MUTEX_INITIALIZER;
+void
+runtime_gchelper(void)
+{
+       // Wait until main proc is ready for mark help.
+       runtime_lock(&work.markgate);
+       runtime_unlock(&work.markgate);
+       scanblock(nil, 0);
+
+       // Wait until main proc is ready for sweep help.
+       runtime_lock(&work.sweepgate);
+       runtime_unlock(&work.sweepgate);
+       sweep();
+
+       if(runtime_xadd(&work.ndone, +1) == work.nproc-1)
+               runtime_notewakeup(&work.alldone);
+}
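
markgate and sweepgate implement a simple "gate": the coordinator holds the
lock while it prepares a phase, and each helper rendezvous by locking and
immediately unlocking, so no helper enters the phase early. The idiom with
pthreads standing in for runtime_lock/runtime_unlock (a sketch):

    #include <pthread.h>

    static pthread_mutex_t gate = PTHREAD_MUTEX_INITIALIZER;

    static void coordinator_close_gate(void) { pthread_mutex_lock(&gate); }
    static void coordinator_open_gate(void)  { pthread_mutex_unlock(&gate); }

    static void
    helper_wait_at_gate(void)
    {
        pthread_mutex_lock(&gate);    /* blocks until the coordinator opens it */
        pthread_mutex_unlock(&gate);  /* let the next helper through */
    }
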
+
+// Semaphore, not Lock, so that the goroutine
+// reschedules when there is contention rather
+// than spinning.
+static uint32 gcsema = 1;
 
 // Initialized from $GOGC.  GOGC=off means no gc.
 //
@@ -470,13 +891,54 @@ static pthread_mutex_t gcsema = PTHREAD_MUTEX_INITIALIZER;
 // extra memory used).
 static int32 gcpercent = -2;
 
+static void
+stealcache(void)
+{
+       M *m;
+
+       for(m=runtime_allm; m; m=m->alllink)
+               runtime_MCache_ReleaseAll(m->mcache);
+}
+
+static void
+cachestats(void)
+{
+       M *m;
+       MCache *c;
+       uint32 i;
+       uint64 stacks_inuse;
+       uint64 stacks_sys;
+
+       stacks_inuse = 0;
+       stacks_sys = 0;
+       for(m=runtime_allm; m; m=m->alllink) {
+               runtime_purgecachedstats(m);
+               // stacks_inuse += m->stackalloc->inuse;
+               // stacks_sys += m->stackalloc->sys;
+               c = m->mcache;
+               for(i=0; i<nelem(c->local_by_size); i++) {
+                       mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc;
+                       c->local_by_size[i].nmalloc = 0;
+                       mstats.by_size[i].nfree += c->local_by_size[i].nfree;
+                       c->local_by_size[i].nfree = 0;
+               }
+       }
+       mstats.stacks_inuse = stacks_inuse;
+       mstats.stacks_sys = stacks_sys;
+}
+
 void
-runtime_gc(int32 force __attribute__ ((unused)))
+runtime_gc(int32 force)
 {
+       M *m;
        int64 t0, t1, t2, t3;
        uint64 heap0, heap1, obj0, obj1;
-       char *p;
-       Finalizer *fp;
+       const byte *p;
+       bool extra;
+
+       // Make sure all registers are saved on stack so that
+       // scanstack sees them.
+       __builtin_unwind_init();
 
        // The gc is turned off (via enablegc) until
        // the bootstrap has completed.
@@ -486,18 +948,19 @@ runtime_gc(int32 force __attribute__ ((unused)))
        // problems, don't bother trying to run gc
        // while holding a lock.  The next mallocgc
        // without a lock will do the gc instead.
-       if(!mstats.enablegc || m->locks > 0 /* || runtime_panicking */)
+       m = runtime_m();
+       if(!mstats.enablegc || m->locks > 0 || runtime_panicking)
                return;
 
        if(gcpercent == -2) {   // first time through
                p = runtime_getenv("GOGC");
                if(p == nil || p[0] == '\0')
                        gcpercent = 100;
-               else if(runtime_strcmp(p, "off") == 0)
+               else if(runtime_strcmp((const char*)p, "off") == 0)
                        gcpercent = -1;
                else
                        gcpercent = runtime_atoi(p);
-               
+
                p = runtime_getenv("GOGCTRACE");
                if(p != nil)
                        gctrace = runtime_atoi(p);
@@ -505,39 +968,66 @@ runtime_gc(int32 force __attribute__ ((unused)))
        if(gcpercent < 0)
                return;
 
-       pthread_mutex_lock(&finqlock);
-       pthread_mutex_lock(&gcsema);
+       runtime_semacquire(&gcsema);
        if(!force && mstats.heap_alloc < mstats.next_gc) {
-               pthread_mutex_unlock(&gcsema);
-               pthread_mutex_unlock(&finqlock);
+               runtime_semrelease(&gcsema);
                return;
        }
 
        t0 = runtime_nanotime();
-       nlookup = 0;
-       nsizelookup = 0;
-       naddrlookup = 0;
+       nhandoff = 0;
 
        m->gcing = 1;
        runtime_stoptheworld();
-       if(runtime_mheap.Lock.key != 0)
-               runtime_throw("runtime_mheap locked during gc");
 
-       __go_cachestats();
+       cachestats();
        heap0 = mstats.heap_alloc;
        obj0 = mstats.nmalloc - mstats.nfree;
 
-       mark();
+       runtime_lock(&work.markgate);
+       runtime_lock(&work.sweepgate);
+
+       extra = false;
+       work.nproc = 1;
+       if(runtime_gomaxprocs > 1 && runtime_ncpu > 1) {
+               runtime_noteclear(&work.alldone);
+               work.nproc += runtime_helpgc(&extra);
+       }
+       work.nwait = 0;
+       work.ndone = 0;
+
+       runtime_unlock(&work.markgate);  // let the helpers in
+       mark(scanblock);
+       if(DebugMark)
+               mark(debug_scanblock);
        t1 = runtime_nanotime();
+
+       work.spans = runtime_mheap.allspans;
+       runtime_unlock(&work.sweepgate);  // let the helpers in
        sweep();
+       if(work.nproc > 1)
+               runtime_notesleep(&work.alldone);
        t2 = runtime_nanotime();
-       __go_stealcache();
+
+       stealcache();
+       cachestats();
 
        mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
        m->gcing = 0;
 
        m->locks++;     // disable gc during the mallocs in newproc
+       if(finq != nil) {
+               // kick off or wake up goroutine to run queued finalizers
+               if(fing == nil)
+                       fing = __go_go(runfinq, nil);
+               else if(fingwait) {
+                       fingwait = 0;
+                       runtime_ready(fing);
+               }
+       }
+       m->locks--;
 
+       cachestats();
        heap1 = mstats.heap_alloc;
        obj1 = mstats.nmalloc - mstats.nfree;
 
@@ -547,73 +1037,102 @@ runtime_gc(int32 force __attribute__ ((unused)))
        mstats.numgc++;
        if(mstats.debuggc)
                runtime_printf("pause %llu\n", (unsigned long long)t3-t0);
-       
+
        if(gctrace) {
-               runtime_printf("gc%d: %llu+%llu+%llu ms %llu -> %llu MB %llu -> %llu (%llu-%llu) objects %llu pointer lookups (%llu size, %llu addr)\n",
-                       mstats.numgc, (unsigned long long)(t1-t0)/1000000, (unsigned long long)(t2-t1)/1000000, (unsigned long long)(t3-t2)/1000000,
+               runtime_printf("gc%d(%d): %llu+%llu+%llu ms %llu -> %llu MB %llu -> %llu (%llu-%llu) objects %llu handoff\n",
+                       mstats.numgc, work.nproc, (unsigned long long)(t1-t0)/1000000, (unsigned long long)(t2-t1)/1000000, (unsigned long long)(t3-t2)/1000000,
                        (unsigned long long)heap0>>20, (unsigned long long)heap1>>20, (unsigned long long)obj0, (unsigned long long)obj1,
-                       (unsigned long long)mstats.nmalloc, (unsigned long long)mstats.nfree,
-                       (unsigned long long)nlookup, (unsigned long long)nsizelookup, (unsigned long long)naddrlookup);
+                       (unsigned long long) mstats.nmalloc, (unsigned long long)mstats.nfree,
+                       (unsigned long long) nhandoff);
        }
 
-       pthread_mutex_unlock(&gcsema);
-       runtime_starttheworld();
+       runtime_semrelease(&gcsema);
 
-       // finqlock is still held.
-       fp = finq;
-       if(fp != nil) {
-               // kick off or wake up goroutine to run queued finalizers
-               if(!finstarted) {
-                       __go_go(runfinq, nil);
-                       finstarted = 1;
-               }
-               else if(fingwait) {
-                       fingwait = 0;
-                       pthread_cond_signal(&finqcond);
-               }
-       }
-       m->locks--;
-       pthread_mutex_unlock(&finqlock);
+       // If we could have used another helper proc, start one now,
+       // in the hope that it will be available next time.
+       // It would have been even better to start it before the collection,
+       // but doing so requires allocating memory, so it's tricky to
+       // coordinate.  This lazy approach works out in practice:
+       // we don't mind if the first couple gc rounds don't have quite
+       // the maximum number of procs.
+       runtime_starttheworld(extra);
+
+       // give the queued finalizers, if any, a chance to run  
+       if(finq != nil) 
+               runtime_gosched();
 
        if(gctrace > 1 && !force)
                runtime_gc(1);
 }
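
The pacing rule set above, next_gc = heap_alloc + heap_alloc*gcpercent/100,
means the default GOGC=100 lets the live heap double before the next
collection triggers. A tiny check of the schedule (a sketch):

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
        uint64_t heap_alloc = 4 << 20;   /* hypothetical: 4 MB live after a GC */
        int32_t gcpercent = 100;         /* default when $GOGC is unset */
        uint64_t next_gc = heap_alloc + heap_alloc*gcpercent/100;

        printf("next gc at %llu MB\n", (unsigned long long)(next_gc >> 20));
        return 0;
    }
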
 
-static void
-runfinq(void* dummy)
+void runtime_ReadMemStats(MStats *)
+  __asm__("libgo_runtime.runtime.ReadMemStats");
+
+void
+runtime_ReadMemStats(MStats *stats)
 {
-       Finalizer *f, *next;
+       M *m;
+
+       // Have to acquire gcsema to stop the world,
+       // because stoptheworld can only be used by
+       // one goroutine at a time, and there might be
+       // a pending garbage collection already calling it.
+       runtime_semacquire(&gcsema);
+       m = runtime_m();
+       m->gcing = 1;
+       runtime_stoptheworld();
+       cachestats();
+       *stats = mstats;
+       m->gcing = 0;
+       runtime_semrelease(&gcsema);
+       runtime_starttheworld(false);
+}
 
-       USED(dummy);
+static void
+runfinq(void* dummy __attribute__ ((unused)))
+{
+       G* gp;
+       Finalizer *f;
+       FinBlock *fb, *next;
+       uint32 i;
 
+       gp = runtime_g();
        for(;;) {
-               pthread_mutex_lock(&finqlock);
-               f = finq;
+               // There's no need for a lock in this section
+               // because it only conflicts with the garbage
+               // collector, and the garbage collector only
+               // runs when everyone else is stopped, and
+               // runfinq only stops at the gosched() or
+               // during the calls in the for loop.
+               fb = finq;
                finq = nil;
-               if(f == nil) {
+               if(fb == nil) {
                        fingwait = 1;
-                       pthread_cond_wait(&finqcond, &finqlock);
-                       pthread_mutex_unlock(&finqlock);
+                       gp->status = Gwaiting;
+                       gp->waitreason = "finalizer wait";
+                       runtime_gosched();
                        continue;
                }
-               pthread_mutex_unlock(&finqlock);
-               for(; f; f=next) {
-                       void *params[1];
-
-                       next = f->next;
-                       params[0] = &f->arg;
-                       reflect_call(f->ft, (void*)f->fn, 0, params, nil);
-                       f->fn = nil;
-                       f->arg = nil;
-                       f->next = nil;
-                       runtime_free(f);
+               for(; fb; fb=next) {
+                       next = fb->next;
+                       for(i=0; i<(uint32)fb->cnt; i++) {
+                               void *params[1];
+
+                               f = &fb->fin[i];
+                               params[0] = &f->arg;
+                               runtime_setblockspecial(f->arg, false);
+                               reflect_call(f->ft, (void*)f->fn, 0, 0, params, nil);
+                               f->fn = nil;
+                               f->arg = nil;
+                       }
+                       fb->cnt = 0;
+                       fb->next = finc;
+                       finc = fb;
                }
                runtime_gc(1);  // trigger another gc to clean up the finalized objects, if possible
        }
 }
 
-#define runtime_gomaxprocs 2
-
 // mark the block at v of size n as allocated.
 // If noptr is true, mark it as having no pointers.
 void
@@ -636,11 +1155,11 @@ runtime_markallocated(void *v, uintptr n, bool noptr)
                bits = (obits & ~(bitMask<<shift)) | (bitAllocated<<shift);
                if(noptr)
                        bits |= bitNoPointers<<shift;
-               if(runtime_gomaxprocs == 1) {
+               if(runtime_singleproc) {
                        *b = bits;
                        break;
                } else {
-                       // gomaxprocs > 1: use atomic op
+                       // more than one goroutine is potentially running: use atomic op
                        if(runtime_casp((void**)b, (void*)obits, (void*)bits))
                                break;
                }
@@ -666,11 +1185,11 @@ runtime_markfreed(void *v, uintptr n)
        for(;;) {
                obits = *b;
                bits = (obits & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
-               if(runtime_gomaxprocs == 1) {
+               if(runtime_singleproc) {
                        *b = bits;
                        break;
                } else {
-                       // gomaxprocs > 1: use atomic op
+                       // more than one goroutine is potentially running: use atomic op
                        if(runtime_casp((void**)b, (void*)obits, (void*)bits))
                                break;
                }
@@ -758,6 +1277,9 @@ runtime_blockspecial(void *v)
 {
        uintptr *b, off, shift;
 
+       if(DebugMark)
+               return true;
+
        off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start;
        b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
        shift = off % wordsPerBitmapWord;
@@ -766,28 +1288,34 @@ runtime_blockspecial(void *v)
 }
 
 void
-runtime_setblockspecial(void *v)
+runtime_setblockspecial(void *v, bool s)
 {
        uintptr *b, off, shift, bits, obits;
 
+       if(DebugMark)
+               return;
+
        off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start;
        b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
        shift = off % wordsPerBitmapWord;
 
        for(;;) {
                obits = *b;
-               bits = obits | (bitSpecial<<shift);
-               if(runtime_gomaxprocs == 1) {
+               if(s)
+                       bits = obits | (bitSpecial<<shift);
+               else
+                       bits = obits & ~(bitSpecial<<shift);
+               if(runtime_singleproc) {
                        *b = bits;
                        break;
                } else {
-                       // gomaxprocs > 1: use atomic op
+                       // more than one goroutine is potentially running: use atomic op
                        if(runtime_casp((void**)b, (void*)obits, (void*)bits))
                                break;
                }
        }
 }
+
 void
 runtime_MHeap_MapBits(MHeap *h)
 {
@@ -798,7 +1326,7 @@ runtime_MHeap_MapBits(MHeap *h)
                bitmapChunk = 8192
        };
        uintptr n;
-       
+
        n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
        n = (n+bitmapChunk-1) & ~(bitmapChunk-1);
        if(h->bitmap_mapped >= n)
@@ -807,9 +1335,3 @@ runtime_MHeap_MapBits(MHeap *h)
        runtime_SysMap(h->arena_start - n, n - h->bitmap_mapped);
        h->bitmap_mapped = n;
 }
-
-void
-__go_enable_gc()
-{
-  mstats.enablegc = 1;
-}