From: Dmitriy Vyukov Date: Tue, 13 Aug 2013 18:07:42 +0000 (+0400) Subject: runtime: fix false deadlock crash X-Git-Tag: go1.2rc2~619 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=1da1030b5dd9d9610ccada4413a23e77b21c7f3b;p=gostls13.git runtime: fix false deadlock crash Fixes #6070. Update #6055. R=golang-dev, nightlyone, rsc CC=golang-dev https://golang.org/cl/12602043 --- diff --git a/src/pkg/net/tcp_test.go b/src/pkg/net/tcp_test.go index cad76b460e..5b3d8353cf 100644 --- a/src/pkg/net/tcp_test.go +++ b/src/pkg/net/tcp_test.go @@ -494,3 +494,78 @@ func TestTCPReadWriteMallocs(t *testing.T) { t.Fatalf("Got %v allocs, want %v", mallocs, maxMallocs) } } + +func TestTCPStress(t *testing.T) { + const conns = 2 + const msgs = 1e4 + const msgLen = 512 + + sendMsg := func(c Conn, buf []byte) bool { + n, err := c.Write(buf) + if n != len(buf) || err != nil { + t.Logf("Write failed: %v", err) + return false + } + return true + } + recvMsg := func(c Conn, buf []byte) bool { + for read := 0; read != len(buf); { + n, err := c.Read(buf) + read += n + if err != nil { + t.Logf("Read failed: %v", err) + return false + } + } + return true + } + + ln, err := Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("Listen failed: %v", err) + } + defer ln.Close() + // Acceptor. + go func() { + for { + c, err := ln.Accept() + if err != nil { + break + } + // Server connection. + go func(c Conn) { + defer c.Close() + var buf [msgLen]byte + for m := 0; m < msgs; m++ { + if !recvMsg(c, buf[:]) || !sendMsg(c, buf[:]) { + break + } + } + }(c) + } + }() + done := make(chan bool) + for i := 0; i < conns; i++ { + // Client connection. + go func() { + defer func() { + done <- true + }() + c, err := Dial("tcp", ln.Addr().String()) + if err != nil { + t.Logf("Dial failed: %v", err) + return + } + defer c.Close() + var buf [msgLen]byte + for m := 0; m < msgs; m++ { + if !sendMsg(c, buf[:]) || !recvMsg(c, buf[:]) { + break + } + } + }() + } + for i := 0; i < conns; i++ { + <-done + } +} diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c index 5574f0d6dc..ef4d27f314 100644 --- a/src/pkg/runtime/proc.c +++ b/src/pkg/runtime/proc.c @@ -30,7 +30,7 @@ struct Sched { M* midle; // idle m's waiting for work int32 nmidle; // number of idle m's waiting for work - int32 mlocked; // number of locked m's waiting for work + int32 nmidlelocked; // number of locked m's waiting for work int32 mcount; // number of m's that have been created P* pidle; // idle P's @@ -95,7 +95,7 @@ static void stoplockedm(void); static void startlockedm(G*); static void sysmon(void); static uint32 retake(int64); -static void inclocked(int32); +static void incidlelocked(int32); static void checkdead(void); static void exitsyscall0(G*); static void park0(G*); @@ -1019,7 +1019,7 @@ stoplockedm(void) p = releasep(); handoffp(p); } - inclocked(1); + incidlelocked(1); // Wait until another thread schedules lockedg again. runtime·notesleep(&m->park); runtime·noteclear(&m->park); @@ -1042,7 +1042,7 @@ startlockedm(G *gp) if(mp->nextp) runtime·throw("startlockedm: m has p"); // directly handoff current P to the locked m - inclocked(-1); + incidlelocked(-1); p = releasep(); mp->nextp = p; runtime·notewakeup(&mp->park); @@ -1485,7 +1485,7 @@ void p = releasep(); handoffp(p); if(g->isbackground) // do not consider blocked scavenger for deadlock detection - inclocked(1); + incidlelocked(1); // Resave for traceback during blocked call. save(runtime·getcallerpc(&dummy), runtime·getcallersp(&dummy)); @@ -1505,7 +1505,7 @@ runtime·exitsyscall(void) m->locks++; // see comment in entersyscall if(g->isbackground) // do not consider blocked scavenger for deadlock detection - inclocked(-1); + incidlelocked(-1); if(exitsyscallfast()) { // There's a cpu for us, so we can run. @@ -2159,10 +2159,10 @@ releasep(void) } static void -inclocked(int32 v) +incidlelocked(int32 v) { runtime·lock(&runtime·sched); - runtime·sched.mlocked += v; + runtime·sched.nmidlelocked += v; if(v > 0) checkdead(); runtime·unlock(&runtime·sched); @@ -2177,12 +2177,12 @@ checkdead(void) int32 run, grunning, s; // -1 for sysmon - run = runtime·sched.mcount - runtime·sched.nmidle - runtime·sched.mlocked - 1; + run = runtime·sched.mcount - runtime·sched.nmidle - runtime·sched.nmidlelocked - 1; if(run > 0) return; if(run < 0) { - runtime·printf("checkdead: nmidle=%d mlocked=%d mcount=%d\n", - runtime·sched.nmidle, runtime·sched.mlocked, runtime·sched.mcount); + runtime·printf("checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n", + runtime·sched.nmidle, runtime·sched.nmidlelocked, runtime·sched.mcount); runtime·throw("checkdead: inconsistent counts"); } grunning = 0; @@ -2238,7 +2238,18 @@ sysmon(void) if(lastpoll != 0 && lastpoll + 10*1000*1000 > now) { runtime·cas64(&runtime·sched.lastpoll, lastpoll, now); gp = runtime·netpoll(false); // non-blocking - injectglist(gp); + if(gp) { + // Need to decrement number of idle locked M's + // (pretending that one more is running) before injectglist. + // Otherwise it can lead to the following situation: + // injectglist grabs all P's but before it starts M's to run the P's, + // another M returns from syscall, finishes running its G, + // observes that there is no work to do and no other running M's + // and reports deadlock. + incidlelocked(-1); + injectglist(gp); + incidlelocked(1); + } } // retake P's blocked in syscalls // and preempt long running G's @@ -2284,15 +2295,16 @@ retake(int64 now) if(p->runqhead == p->runqtail && runtime·atomicload(&runtime·sched.nmspinning) + runtime·atomicload(&runtime·sched.npidle) > 0) continue; - // Need to increment number of locked M's before the CAS. + // Need to decrement number of idle locked M's + // (pretending that one more is running) before the CAS. // Otherwise the M from which we retake can exit the syscall, // increment nmidle and report deadlock. - inclocked(-1); + incidlelocked(-1); if(runtime·cas(&p->status, s, Pidle)) { n++; handoffp(p); } - inclocked(1); + incidlelocked(1); } else if(s == Prunning) { // Preempt G if it's running for more than 10ms. if(pd->when + 10*1000*1000 > now)