]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile/internal/ppc64, runtime internal/atomic, sync/atomic: implement faster...
authorCarlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Thu, 4 Jan 2018 20:23:27 +0000 (18:23 -0200)
committerLynn Boger <laboger@linux.vnet.ibm.com>
Thu, 22 Mar 2018 14:13:01 +0000 (14:13 +0000)
This change implements faster atomics for ppc64x based on the ISA 2.07B,
Appendix B.2 recommendations, replacing SYNC/ISYNC by LWSYNC in some
cases.

Updates #21348

name                                           old time/op new time/op    delta
Cond1-16                                           955ns     856ns      -10.33%
Cond2-16                                          2.38µs    2.03µs      -14.59%
Cond4-16                                          5.90µs    5.44µs       -7.88%
Cond8-16                                          12.1µs    11.1µs       -8.42%
Cond16-16                                         27.0µs    25.1µs       -7.04%
Cond32-16                                         59.1µs    55.5µs       -6.14%
LoadMostlyHits/*sync_test.DeepCopyMap-16          22.1ns    24.1ns       +9.02%
LoadMostlyHits/*sync_test.RWMutexMap-16            252ns     249ns       -1.20%
LoadMostlyHits/*sync.Map-16                       16.2ns    16.3ns         ~
LoadMostlyMisses/*sync_test.DeepCopyMap-16        22.3ns    22.6ns         ~
LoadMostlyMisses/*sync_test.RWMutexMap-16          249ns     247ns       -0.51%
LoadMostlyMisses/*sync.Map-16                     12.7ns    12.7ns         ~
LoadOrStoreBalanced/*sync_test.RWMutexMap-16      1.27µs    1.17µs       -7.54%
LoadOrStoreBalanced/*sync.Map-16                  1.12µs    1.10µs       -2.35%
LoadOrStoreUnique/*sync_test.RWMutexMap-16        1.75µs    1.68µs       -3.84%
LoadOrStoreUnique/*sync.Map-16                    2.07µs    1.97µs       -5.13%
LoadOrStoreCollision/*sync_test.DeepCopyMap-16    15.8ns    15.9ns         ~
LoadOrStoreCollision/*sync_test.RWMutexMap-16      496ns     424ns      -14.48%
LoadOrStoreCollision/*sync.Map-16                 6.07ns    6.07ns         ~
Range/*sync_test.DeepCopyMap-16                   1.65µs    1.64µs         ~
Range/*sync_test.RWMutexMap-16                     278µs     288µs       +3.75%
Range/*sync.Map-16                                2.00µs    2.01µs         ~
AdversarialAlloc/*sync_test.DeepCopyMap-16        3.45µs    3.44µs         ~
AdversarialAlloc/*sync_test.RWMutexMap-16          226ns     227ns         ~
AdversarialAlloc/*sync.Map-16                     1.09µs    1.07µs       -2.36%
AdversarialDelete/*sync_test.DeepCopyMap-16        553ns     550ns       -0.57%
AdversarialDelete/*sync_test.RWMutexMap-16         273ns     274ns         ~
AdversarialDelete/*sync.Map-16                     247ns     249ns         ~
UncontendedSemaphore-16                           79.0ns    65.5ns      -17.11%
ContendedSemaphore-16                              112ns      97ns      -13.77%
MutexUncontended-16                               3.34ns    2.51ns      -24.69%
Mutex-16                                           266ns     191ns      -28.26%
MutexSlack-16                                      226ns     159ns      -29.55%
MutexWork-16                                       377ns     338ns      -10.14%
MutexWorkSlack-16                                  335ns     308ns       -8.20%
MutexNoSpin-16                                     196ns     184ns       -5.91%
MutexSpin-16                                       710ns     666ns       -6.21%
Once-16                                           1.29ns    1.29ns         ~
Pool-16                                           8.64ns    8.71ns         ~
PoolOverflow-16                                   1.60µs    1.44µs      -10.25%
SemaUncontended-16                                5.39ns    4.42ns      -17.96%
SemaSyntNonblock-16                                539ns     483ns      -10.42%
SemaSyntBlock-16                                   413ns     354ns      -14.20%
SemaWorkNonblock-16                                305ns     258ns      -15.36%
SemaWorkBlock-16                                   266ns     229ns      -14.06%
RWMutexUncontended-16                             12.9ns     9.7ns      -24.80%
RWMutexWrite100-16                                 203ns     147ns      -27.47%
RWMutexWrite10-16                                  177ns     119ns      -32.74%
RWMutexWorkWrite100-16                             435ns     403ns       -7.39%
RWMutexWorkWrite10-16                              642ns     611ns       -4.79%
WaitGroupUncontended-16                           4.67ns    3.70ns      -20.92%
WaitGroupAddDone-16                                402ns     355ns      -11.54%
WaitGroupAddDoneWork-16                            208ns     250ns      +20.09%
WaitGroupWait-16                                  1.21ns    1.21ns         ~
WaitGroupWaitWork-16                              5.91ns    5.87ns       -0.81%
WaitGroupActuallyWait-16                          92.2ns    85.8ns       -6.91%

Updates #21348

Change-Id: Ibb9b271d11b308264103829e176c6d9fe8f867d3
Reviewed-on: https://go-review.googlesource.com/95175
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
src/cmd/compile/internal/ppc64/ssa.go
src/runtime/internal/atomic/asm_ppc64x.s
src/sync/atomic/asm_ppc64x.s

index 7a2e2c187832658d89c756a1e33a53530f407ca8..8d843f07567d7ea8d69c2af3c7d81742b52116d1 100644 (file)
@@ -154,16 +154,17 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 
        case ssa.OpPPC64LoweredAtomicAnd8,
                ssa.OpPPC64LoweredAtomicOr8:
-               // SYNC
+               // LWSYNC
                // LBAR         (Rarg0), Rtmp
                // AND/OR       Rarg1, Rtmp
                // STBCCC       Rtmp, (Rarg0)
                // BNE          -3(PC)
-               // ISYNC
                r0 := v.Args[0].Reg()
                r1 := v.Args[1].Reg()
-               psync := s.Prog(ppc64.ASYNC)
-               psync.To.Type = obj.TYPE_NONE
+               // LWSYNC - Assuming shared data not write-through-required nor
+               // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
+               plwsync := s.Prog(ppc64.ALWSYNC)
+               plwsync.To.Type = obj.TYPE_NONE
                p := s.Prog(ppc64.ALBAR)
                p.From.Type = obj.TYPE_MEM
                p.From.Reg = r0
@@ -183,17 +184,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p3 := s.Prog(ppc64.ABNE)
                p3.To.Type = obj.TYPE_BRANCH
                gc.Patch(p3, p)
-               pisync := s.Prog(ppc64.AISYNC)
-               pisync.To.Type = obj.TYPE_NONE
 
        case ssa.OpPPC64LoweredAtomicAdd32,
                ssa.OpPPC64LoweredAtomicAdd64:
-               // SYNC
+               // LWSYNC
                // LDAR/LWAR    (Rarg0), Rout
                // ADD          Rarg1, Rout
                // STDCCC/STWCCC Rout, (Rarg0)
                // BNE         -3(PC)
-               // ISYNC
                // MOVW         Rout,Rout (if Add32)
                ld := ppc64.ALDAR
                st := ppc64.ASTDCCC
@@ -204,9 +202,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                r0 := v.Args[0].Reg()
                r1 := v.Args[1].Reg()
                out := v.Reg0()
-               // SYNC
-               psync := s.Prog(ppc64.ASYNC)
-               psync.To.Type = obj.TYPE_NONE
+               // LWSYNC - Assuming shared data not write-through-required nor
+               // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
+               plwsync := s.Prog(ppc64.ALWSYNC)
+               plwsync.To.Type = obj.TYPE_NONE
                // LDAR or LWAR
                p := s.Prog(ld)
                p.From.Type = obj.TYPE_MEM
@@ -229,9 +228,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p4 := s.Prog(ppc64.ABNE)
                p4.To.Type = obj.TYPE_BRANCH
                gc.Patch(p4, p)
-               // ISYNC
-               pisync := s.Prog(ppc64.AISYNC)
-               pisync.To.Type = obj.TYPE_NONE
 
                // Ensure a 32 bit result
                if v.Op == ssa.OpPPC64LoweredAtomicAdd32 {
@@ -244,7 +240,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 
        case ssa.OpPPC64LoweredAtomicExchange32,
                ssa.OpPPC64LoweredAtomicExchange64:
-               // SYNC
+               // LWSYNC
                // LDAR/LWAR    (Rarg0), Rout
                // STDCCC/STWCCC Rout, (Rarg0)
                // BNE         -2(PC)
@@ -258,9 +254,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                r0 := v.Args[0].Reg()
                r1 := v.Args[1].Reg()
                out := v.Reg0()
-               // SYNC
-               psync := s.Prog(ppc64.ASYNC)
-               psync.To.Type = obj.TYPE_NONE
+               // LWSYNC - Assuming shared data not write-through-required nor
+               // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
+               plwsync := s.Prog(ppc64.ALWSYNC)
+               plwsync.To.Type = obj.TYPE_NONE
                // LDAR or LWAR
                p := s.Prog(ld)
                p.From.Type = obj.TYPE_MEM
@@ -342,14 +339,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 
        case ssa.OpPPC64LoweredAtomicCas64,
                ssa.OpPPC64LoweredAtomicCas32:
-               // SYNC
+               // LWSYNC
                // loop:
                // LDAR        (Rarg0), Rtmp
                // CMP         Rarg1, Rtmp
                // BNE         fail
                // STDCCC      Rarg2, (Rarg0)
                // BNE         loop
-               // ISYNC
+               // LWSYNC
                // MOVD        $1, Rout
                // BR          end
                // fail:
@@ -367,9 +364,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                r1 := v.Args[1].Reg()
                r2 := v.Args[2].Reg()
                out := v.Reg0()
-               // SYNC
-               psync := s.Prog(ppc64.ASYNC)
-               psync.To.Type = obj.TYPE_NONE
+               // LWSYNC - Assuming shared data not write-through-required nor
+               // caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
+               plwsync1 := s.Prog(ppc64.ALWSYNC)
+               plwsync1.To.Type = obj.TYPE_NONE
                // LDAR or LWAR
                p := s.Prog(ld)
                p.From.Type = obj.TYPE_MEM
@@ -395,9 +393,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p4 := s.Prog(ppc64.ABNE)
                p4.To.Type = obj.TYPE_BRANCH
                gc.Patch(p4, p)
-               // ISYNC
-               pisync := s.Prog(ppc64.AISYNC)
-               pisync.To.Type = obj.TYPE_NONE
+               // LWSYNC - Assuming shared data not write-through-required nor
+               // caching-inhibited. See Appendix B.2.1.1 in the ISA 2.07b.
+               plwsync2 := s.Prog(ppc64.ALWSYNC)
+               plwsync2.To.Type = obj.TYPE_NONE
                // return true
                p5 := s.Prog(ppc64.AMOVD)
                p5.From.Type = obj.TYPE_CONST
index 7117aef1580790322ef41062209bf3f0ae842ef5..a2ed4adc91a0332bf95ed0ee349e61d9a90394e4 100644 (file)
@@ -17,7 +17,7 @@ TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
        MOVD    ptr+0(FP), R3
        MOVWZ   old+8(FP), R4
        MOVWZ   new+12(FP), R5
-       SYNC
+       LWSYNC
 cas_again:
        LWAR    (R3), R6
        CMPW    R6, R4
@@ -25,7 +25,7 @@ cas_again:
        STWCCC  R5, (R3)
        BNE     cas_again
        MOVD    $1, R3
-       ISYNC
+       LWSYNC
        MOVB    R3, ret+16(FP)
        RET
 cas_fail:
@@ -44,7 +44,7 @@ TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
        MOVD    ptr+0(FP), R3
        MOVD    old+8(FP), R4
        MOVD    new+16(FP), R5
-       SYNC
+       LWSYNC
 cas64_again:
        LDAR    (R3), R6
        CMP     R6, R4
@@ -52,7 +52,7 @@ cas64_again:
        STDCCC  R5, (R3)
        BNE     cas64_again
        MOVD    $1, R3
-       ISYNC
+       LWSYNC
        MOVB    R3, ret+24(FP)
        RET
 cas64_fail:
@@ -97,31 +97,29 @@ TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-25
 TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
        MOVD    ptr+0(FP), R4
        MOVW    delta+8(FP), R5
-       SYNC
+       LWSYNC
        LWAR    (R4), R3
        ADD     R5, R3
        STWCCC  R3, (R4)
        BNE     -3(PC)
-       ISYNC
        MOVW    R3, ret+16(FP)
        RET
 
 TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
        MOVD    ptr+0(FP), R4
        MOVD    delta+8(FP), R5
-       SYNC
+       LWSYNC
        LDAR    (R4), R3
        ADD     R5, R3
        STDCCC  R3, (R4)
        BNE     -3(PC)
-       ISYNC
        MOVD    R3, ret+16(FP)
        RET
 
 TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
        MOVD    ptr+0(FP), R4
        MOVW    new+8(FP), R5
-       SYNC
+       LWSYNC
        LWAR    (R4), R3
        STWCCC  R5, (R4)
        BNE     -2(PC)
@@ -132,7 +130,7 @@ TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
 TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
        MOVD    ptr+0(FP), R4
        MOVD    new+8(FP), R5
-       SYNC
+       LWSYNC
        LDAR    (R4), R3
        STDCCC  R5, (R4)
        BNE     -2(PC)
@@ -165,24 +163,22 @@ TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
 TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
        MOVD    ptr+0(FP), R3
        MOVBZ   val+8(FP), R4
-       SYNC
+       LWSYNC
 again:
        LBAR    (R3), R6
        OR      R4, R6
        STBCCC  R6, (R3)
        BNE     again
-       ISYNC
        RET
 
 // void runtime∕internal∕atomic·And8(byte volatile*, byte);
 TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
        MOVD    ptr+0(FP), R3
        MOVBZ   val+8(FP), R4
-       SYNC
+       LWSYNC
 again:
        LBAR    (R3),R6
        AND     R4,R6
        STBCCC  R6,(R3)
        BNE     again
-       ISYNC
        RET
index 44e26698b47fb271c370f7b0d661ee55f9f26a6f..dc93ed8e1d69278f8325500112403088a4d6d8e9 100644 (file)
@@ -12,7 +12,7 @@ TEXT ·SwapInt32(SB),NOSPLIT,$0-20
 TEXT ·SwapUint32(SB),NOSPLIT,$0-20
        MOVD    addr+0(FP), R3
        MOVW    new+8(FP), R4
-       SYNC
+       LWSYNC
        LWAR    (R3), R5
        STWCCC  R4, (R3)
        BNE     -2(PC)
@@ -26,7 +26,7 @@ TEXT ·SwapInt64(SB),NOSPLIT,$0-24
 TEXT ·SwapUint64(SB),NOSPLIT,$0-24
        MOVD    addr+0(FP), R3
        MOVD    new+8(FP), R4
-       SYNC
+       LWSYNC
        LDAR    (R3), R5
        STDCCC  R4, (R3)
        BNE     -2(PC)
@@ -44,13 +44,13 @@ TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-17
        MOVD    addr+0(FP), R3
        MOVW    old+8(FP), R4
        MOVW    new+12(FP), R5
-       SYNC
+       LWSYNC
        LWAR    (R3), R6
        CMPW    R6, R4
        BNE     7(PC)
        STWCCC  R5, (R3)
        BNE     -4(PC)
-       ISYNC
+       LWSYNC
        MOVD    $1, R3
        MOVB    R3, swapped+16(FP)
        RET
@@ -67,13 +67,13 @@ TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-25
        MOVD    addr+0(FP), R3
        MOVD    old+8(FP), R4
        MOVD    new+16(FP), R5
-       SYNC
+       LWSYNC
        LDAR    (R3), R6
        CMP     R6, R4
        BNE     7(PC)
        STDCCC  R5, (R3)
        BNE     -4(PC)
-       ISYNC
+       LWSYNC
        MOVD    $1, R3
        MOVB    R3, swapped+24(FP)
        RET
@@ -86,12 +86,11 @@ TEXT ·AddInt32(SB),NOSPLIT,$0-20
 TEXT ·AddUint32(SB),NOSPLIT,$0-20
        MOVD    addr+0(FP), R3
        MOVW    delta+8(FP), R4
-       SYNC
+       LWSYNC
        LWAR    (R3), R5
        ADD     R4, R5
        STWCCC  R5, (R3)
        BNE     -3(PC)
-       ISYNC
        MOVW    R5, new+16(FP)
        RET
 
@@ -104,12 +103,11 @@ TEXT ·AddInt64(SB),NOSPLIT,$0-24
 TEXT ·AddUint64(SB),NOSPLIT,$0-24
        MOVD    addr+0(FP), R3
        MOVD    delta+8(FP), R4
-       SYNC
+       LWSYNC
        LDAR    (R3), R5
        ADD     R4, R5
        STDCCC  R5, (R3)
        BNE     -3(PC)
-       ISYNC
        MOVD    R5, new+16(FP)
        RET