]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: replace Semacquire/Semrelease implementation
authorDmitriy Vyukov <dvyukov@google.com>
Tue, 28 Jun 2011 19:09:53 +0000 (15:09 -0400)
committerRuss Cox <rsc@golang.org>
Tue, 28 Jun 2011 19:09:53 +0000 (15:09 -0400)
1. The implementation uses distributed hash table of waitlists instead of a centralized one.
  It significantly improves scalability for uncontended semaphores.
2. The implementation provides wait-free fast-path for signalers.
3. The implementation uses less locks (1 lock/unlock instead of 5 for Semacquire).
4. runtime·ready() call is moved out of critical section.
5. Semacquire() does not call semwake().
Benchmark results on HP Z600 (2 x Xeon E5620, 8 HT cores, 2.40GHz)
are as follows:
benchmark                                        old ns/op    new ns/op    delta
runtime_test.BenchmarkSemaUncontended                58.20        36.30  -37.63%
runtime_test.BenchmarkSemaUncontended-2             199.00        18.30  -90.80%
runtime_test.BenchmarkSemaUncontended-4             327.00         9.20  -97.19%
runtime_test.BenchmarkSemaUncontended-8             491.00         5.32  -98.92%
runtime_test.BenchmarkSemaUncontended-16            946.00         4.18  -99.56%

runtime_test.BenchmarkSemaSyntNonblock               59.00        36.80  -37.63%
runtime_test.BenchmarkSemaSyntNonblock-2            167.00       138.00  -17.37%
runtime_test.BenchmarkSemaSyntNonblock-4            333.00       129.00  -61.26%
runtime_test.BenchmarkSemaSyntNonblock-8            464.00       130.00  -71.98%
runtime_test.BenchmarkSemaSyntNonblock-16          1015.00       136.00  -86.60%

runtime_test.BenchmarkSemaSyntBlock                  58.80        36.70  -37.59%
runtime_test.BenchmarkSemaSyntBlock-2               294.00       149.00  -49.32%
runtime_test.BenchmarkSemaSyntBlock-4               333.00       177.00  -46.85%
runtime_test.BenchmarkSemaSyntBlock-8               471.00       221.00  -53.08%
runtime_test.BenchmarkSemaSyntBlock-16              990.00       227.00  -77.07%

runtime_test.BenchmarkSemaWorkNonblock              829.00       832.00   +0.36%
runtime_test.BenchmarkSemaWorkNonblock-2            425.00       419.00   -1.41%
runtime_test.BenchmarkSemaWorkNonblock-4            308.00       220.00  -28.57%
runtime_test.BenchmarkSemaWorkNonblock-8            394.00       147.00  -62.69%
runtime_test.BenchmarkSemaWorkNonblock-16          1510.00       149.00  -90.13%

runtime_test.BenchmarkSemaWorkBlock                 828.00       813.00   -1.81%
runtime_test.BenchmarkSemaWorkBlock-2               428.00       436.00   +1.87%
runtime_test.BenchmarkSemaWorkBlock-4               232.00       219.00   -5.60%
runtime_test.BenchmarkSemaWorkBlock-8               392.00       251.00  -35.97%
runtime_test.BenchmarkSemaWorkBlock-16             1524.00       298.00  -80.45%

sync_test.BenchmarkMutexUncontended                  24.10        24.00   -0.41%
sync_test.BenchmarkMutexUncontended-2                12.00        12.00   +0.00%
sync_test.BenchmarkMutexUncontended-4                 6.25         6.17   -1.28%
sync_test.BenchmarkMutexUncontended-8                 3.43         3.34   -2.62%
sync_test.BenchmarkMutexUncontended-16                2.34         2.32   -0.85%

sync_test.BenchmarkMutex                             24.70        24.70   +0.00%
sync_test.BenchmarkMutex-2                          208.00        99.50  -52.16%
sync_test.BenchmarkMutex-4                         2744.00       256.00  -90.67%
sync_test.BenchmarkMutex-8                         5137.00       556.00  -89.18%
sync_test.BenchmarkMutex-16                        5368.00      1284.00  -76.08%

sync_test.BenchmarkMutexSlack                        24.70        25.00   +1.21%
sync_test.BenchmarkMutexSlack-2                    1094.00       186.00  -83.00%
sync_test.BenchmarkMutexSlack-4                    3430.00       402.00  -88.28%
sync_test.BenchmarkMutexSlack-8                    5051.00      1066.00  -78.90%
sync_test.BenchmarkMutexSlack-16                   6806.00      1363.00  -79.97%

sync_test.BenchmarkMutexWork                        793.00       792.00   -0.13%
sync_test.BenchmarkMutexWork-2                      398.00       398.00   +0.00%
sync_test.BenchmarkMutexWork-4                     1441.00       308.00  -78.63%
sync_test.BenchmarkMutexWork-8                     8532.00       847.00  -90.07%
sync_test.BenchmarkMutexWork-16                    8225.00      2760.00  -66.44%

sync_test.BenchmarkMutexWorkSlack                   793.00       793.00   +0.00%
sync_test.BenchmarkMutexWorkSlack-2                 418.00       414.00   -0.96%
sync_test.BenchmarkMutexWorkSlack-4                4481.00       480.00  -89.29%
sync_test.BenchmarkMutexWorkSlack-8                6317.00      1598.00  -74.70%
sync_test.BenchmarkMutexWorkSlack-16               9111.00      3038.00  -66.66%

R=rsc
CC=golang-dev
https://golang.org/cl/4631059

src/pkg/runtime/386/atomic.c [new file with mode: 0644]
src/pkg/runtime/Makefile
src/pkg/runtime/amd64/atomic.c [new file with mode: 0644]
src/pkg/runtime/arm/atomic.c [new file with mode: 0644]
src/pkg/runtime/runtime.h
src/pkg/runtime/sema.goc

diff --git a/src/pkg/runtime/386/atomic.c b/src/pkg/runtime/386/atomic.c
new file mode 100644 (file)
index 0000000..c031cc4
--- /dev/null
@@ -0,0 +1,12 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "runtime.h"
+
+#pragma textflag 7
+uint32
+runtime·atomicload(uint32 volatile* addr)
+{
+       return *addr;
+}
index 79f847e64aec2e77094494d485de055d6f09437d..03f960cb86c9e1d381f2d9c6a719f04c1e160701 100644 (file)
@@ -47,6 +47,7 @@ OFILES_arm=\
 
 OFILES=\
        asm.$O\
+       atomic.$O\
        cgocall.$O\
        chan.$O\
        closure.$O\
diff --git a/src/pkg/runtime/amd64/atomic.c b/src/pkg/runtime/amd64/atomic.c
new file mode 100644 (file)
index 0000000..c031cc4
--- /dev/null
@@ -0,0 +1,12 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "runtime.h"
+
+#pragma textflag 7
+uint32
+runtime·atomicload(uint32 volatile* addr)
+{
+       return *addr;
+}
diff --git a/src/pkg/runtime/arm/atomic.c b/src/pkg/runtime/arm/atomic.c
new file mode 100644 (file)
index 0000000..9fd47ba
--- /dev/null
@@ -0,0 +1,12 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "runtime.h"
+
+#pragma textflag 7
+uint32
+runtime·atomicload(uint32 volatile* addr)
+{
+       return runtime·xadd(addr, 0);
+}
index f3ccff1bcd304ba519807ec743e7bc1d122055c9..7bc0962ba981f6ef5221a77992965ff5e8ad6087 100644 (file)
@@ -416,7 +416,10 @@ int32      runtime·write(int32, void*, int32);
 int32  runtime·mincore(void*, uintptr, byte*);
 bool   runtime·cas(uint32*, uint32, uint32);
 bool   runtime·casp(void**, void*, void*);
+// Don't confuse with XADD x86 instruction,
+// this one is actually 'addx', that is, add-and-fetch.
 uint32 runtime·xadd(uint32 volatile*, int32);
+uint32  runtime·atomicload(uint32 volatile*);
 void   runtime·jmpdefer(byte*, void*);
 void   runtime·exit1(int32);
 void   runtime·ready(G*);
index 1c77e87a53cbd919129d73ab4f6a1b625cb54c51..ae84351edf93af191dadf384dfeb5cf2c0d35dc0 100644 (file)
@@ -23,104 +23,68 @@ package runtime
 typedef struct Sema Sema;
 struct Sema
 {
-       uint32 *addr;
+       uint32 volatile *addr;
        G *g;
        Sema *prev;
        Sema *next;
 };
 
-// TODO: For now, a linked list; maybe a hash table of linked lists later.
-static Sema *semfirst, *semlast;
-static Lock semlock;
+typedef struct SemaRoot SemaRoot;
+struct SemaRoot
+{
+        Lock;
+       Sema *head;
+       Sema *tail;
+       // Number of waiters. Read w/o the lock.
+       uint32 volatile nwait;
+};
+
+// Prime to not correlate with any user patterns.
+#define SEMTABLESZ 251
+
+static union
+{
+       SemaRoot;
+       // Modern processors tend to have 64-byte cache lines,
+       // potentially with 128-byte effective cache line size for reading.
+       // While there are hypothetical architectures
+       // with 16-4096 byte cache lines, 128 looks like a good compromise.
+       uint8 pad[128];
+} semtable[SEMTABLESZ];
+
+static SemaRoot*
+semroot(uint32 *addr)
+{
+       return &semtable[((uintptr)addr >> 3) % SEMTABLESZ];
+}
 
 static void
-semqueue(uint32 *addr, Sema *s)
+semqueue(SemaRoot *root, uint32 volatile *addr, Sema *s)
 {
+       s->g = g;
        s->addr = addr;
-       s->g = nil;
-
-       runtime·lock(&semlock);
-       s->prev = semlast;
        s->next = nil;
-       if(semlast)
-               semlast->next = s;
+       s->prev = root->tail;
+       if(root->tail)
+               root->tail->next = s;
        else
-               semfirst = s;
-       semlast = s;
-       runtime·unlock(&semlock);
+               root->head = s;
+       root->tail = s;
 }
 
 static void
-semdequeue(Sema *s)
+semdequeue(SemaRoot *root, Sema *s)
 {
-       runtime·lock(&semlock);
        if(s->next)
                s->next->prev = s->prev;
        else
-               semlast = s->prev;
+               root->tail = s->prev;
        if(s->prev)
                s->prev->next = s->next;
        else
-               semfirst = s->next;
+               root->head = s->next;
        s->prev = nil;
        s->next = nil;
-       runtime·unlock(&semlock);
-}
-
-static void
-semwakeup(uint32 *addr)
-{
-       Sema *s;
-
-       runtime·lock(&semlock);
-       for(s=semfirst; s; s=s->next) {
-               if(s->addr == addr && s->g) {
-                       runtime·ready(s->g);
-                       s->g = nil;
-                       break;
-               }
-       }
-       runtime·unlock(&semlock);
-}
-
-// Step 1 of sleep: make ourselves available for wakeup.
-// TODO(rsc): Maybe we can write a version without
-// locks by using cas on s->g.  Maybe not: I need to
-// think more about whether it would be correct.
-static void
-semsleep1(Sema *s)
-{
-       runtime·lock(&semlock);
-       s->g = g;
-       runtime·unlock(&semlock);
-}
-
-// Decided not to go through with it: undo step 1.
-static void
-semsleepundo1(Sema *s)
-{
-       runtime·lock(&semlock);
-       if(s->g != nil) {
-               s->g = nil;     // back ourselves out
-       } else {
-               // If s->g == nil already, semwakeup
-               // already readied us.  Since we never stopped
-               // running, readying us just set g->readyonstop.
-               // Clear it.
-               if(g->readyonstop == 0)
-                       *(int32*)0x555 = 555;
-               g->readyonstop = 0;
-       }
-       runtime·unlock(&semlock);
-}
-
-// Step 2: wait for the wakeup.
-static void
-semsleep2(Sema *s)
-{
-       USED(s);
-       g->status = Gwaiting;
-       runtime·gosched();
 }
 
 static int32
@@ -128,52 +92,83 @@ cansemacquire(uint32 *addr)
 {
        uint32 v;
 
-       while((v = *addr) > 0)
+       while((v = runtime·atomicload(addr)) > 0)
                if(runtime·cas(addr, v, v-1))
                        return 1;
        return 0;
 }
 
-// For now has no return value.
-// Might return an ok (not interrupted) bool in the future?
 void
-runtime·semacquire(uint32 *addr)
+runtime·semacquire(uint32 volatile *addr)
 {
        Sema s;
+       SemaRoot *root;
 
        // Easy case.
        if(cansemacquire(addr))
                return;
 
        // Harder case:
-       //      queue
-       //      try semacquire one more time, sleep if failed
-       //      dequeue
-       //      wake up one more guy to avoid races (TODO(rsc): maybe unnecessary?)
-       semqueue(addr, &s);
+       //      increment waiter count
+       //      try cansemacquire one more time, return if succeeded
+       //      enqueue itself as a waiter
+       //      sleep
+       //      (waiter descriptor is dequeued by signaler)
+       root = semroot(addr);
        for(;;) {
-               semsleep1(&s);
+               runtime·lock(root);
+               // Add ourselves to nwait to disable "easy case" in semrelease.
+               runtime·xadd(&root->nwait, 1);
+               // Check cansemacquire to avoid missed wakeup.
                if(cansemacquire(addr)) {
-                       semsleepundo1(&s);
-                       break;
+                       runtime·xadd(&root->nwait, -1);
+                       runtime·unlock(root);
+                       return;
                }
-               semsleep2(&s);
+               // Any semrelease after the cansemacquire knows we're waiting
+               // (we set nwait above), so go to sleep.
+               semqueue(root, addr, &s);
+               g->status = Gwaiting;
+               runtime·unlock(root);
+               runtime·gosched();
+               if(cansemacquire(addr))
+                       return;
        }
-       semdequeue(&s);
-       semwakeup(addr);
 }
 
 void
-runtime·semrelease(uint32 *addr)
+runtime·semrelease(uint32 volatile *addr)
 {
-       uint32 v;
+       Sema *s;
+       SemaRoot *root;
 
-       for(;;) {
-               v = *addr;
-               if(runtime·cas(addr, v, v+1))
+       root = semroot(addr);
+       runtime·xadd(addr, 1);
+
+       // Easy case: no waiters?
+       // This check must happen after the xadd, to avoid a missed wakeup
+       // (see loop in semacquire).
+       if(runtime·atomicload(&root->nwait) == 0)
+               return;
+
+       // Harder case: search for a waiter and wake it.
+       runtime·lock(root);
+       if(runtime·atomicload(&root->nwait) == 0) {
+               // The count is already consumed by another goroutine,
+               // so no need to wake up another goroutine.
+               runtime·unlock(root);
+               return;
+       }
+       for(s = root->head; s; s = s->next) {
+               if(s->addr == addr) {
+                       runtime·xadd(&root->nwait, -1);
+                       semdequeue(root, s);
                        break;
+               }
        }
-       semwakeup(addr);
+       runtime·unlock(root);
+       if(s)
+               runtime·ready(s->g);
 }
 
 func Semacquire(addr *uint32) {