Cypherpunks repositories - gostls13.git/commitdiff
runtime: use native CAS and memory barrier on ARMv7
author: Cherry Zhang <cherryyz@google.com>
Wed, 14 Feb 2018 19:51:39 +0000 (14:51 -0500)
committer: Cherry Zhang <cherryyz@google.com>
Thu, 3 May 2018 21:35:39 +0000 (21:35 +0000)
This gets us around the kernel helpers on ARMv7.

It is slightly faster than using the kernel helper.

name           old time/op  new time/op  delta
AtomicLoad-4   72.5ns ± 0%  69.5ns ± 0%  -4.08%  (p=0.000 n=9+9)
AtomicStore-4  57.6ns ± 1%  54.4ns ± 0%  -5.58%  (p=0.000 n=10+9)
[Geo mean]     64.6ns       61.5ns       -4.83%

If performance is really critical, we can even do compiler intrinsics
on GOARM=7.

Fixes #23792.

Change-Id: I36497d880890b26bdf01e048b542bd5fd7b17d23
Reviewed-on: https://go-review.googlesource.com/94076
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
src/runtime/internal/atomic/bench_test.go
src/runtime/internal/atomic/sys_linux_arm.s
src/runtime/sys_linux_arm.s

index 47010e32d538da1f490a24a23e66720591991316..2a22e88fb80980ffb3ef883d1563a0cec23686d2 100644 (file)
@@ -26,3 +26,19 @@ func BenchmarkAtomicStore64(b *testing.B) {
                atomic.Store64(&x, 0)
        }
 }
+
+func BenchmarkAtomicLoad(b *testing.B) {
+       var x uint32
+       sink = &x
+       for i := 0; i < b.N; i++ {
+               _ = atomic.Load(&x)
+       }
+}
+
+func BenchmarkAtomicStore(b *testing.B) {
+       var x uint32
+       sink = &x
+       for i := 0; i < b.N; i++ {
+               atomic.Store(&x, 0)
+       }
+}
index 60f28e721614c6ce38ddf2b37bece2bc09f3a996..7e234d8f2678fabd7213f628b8834c1604eed8be 100644 (file)
 TEXT cas<>(SB),NOSPLIT,$0
        MOVW    $0xffff0fc0, R15 // R15 is hardware PC.
 
-TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT|NOFRAME,$0
+       MOVB    runtime·goarm(SB), R11
+       CMP     $7, R11
+       BLT     2(PC)
+       JMP     ·armcas(SB)
+       JMP     ·kernelcas<>(SB)
+
+TEXT runtime∕internal∕atomic·kernelcas<>(SB),NOSPLIT,$0
        MOVW    ptr+0(FP), R2
        // trigger potential paging fault here,
        // because we don't know how to traceback through __kuser_cmpxchg
index fc9dc9bbb8f51beabfa9a00bdfb21f6b7ef73927..aa39732cfb66dbab004d9ac55cd64a0fd85f66e7 100644 (file)
@@ -489,13 +489,18 @@ TEXT runtime·usleep(SB),NOSPLIT,$12
 // even on single-core devices. The kernel helper takes care of all of
 // this for us.
 
-TEXT publicationBarrier<>(SB),NOSPLIT,$0
+TEXT kernelPublicationBarrier<>(SB),NOSPLIT,$0
        // void __kuser_memory_barrier(void);
-       MOVW    $0xffff0fa0, R15 // R15 is hardware PC.
+       MOVW    $0xffff0fa0, R11
+       CALL    (R11)
+       RET
 
 TEXT ·publicationBarrier(SB),NOSPLIT,$0
-       BL      publicationBarrier<>(SB)
-       RET
+       MOVB    ·goarm(SB), R11
+       CMP     $7, R11
+       BLT     2(PC)
+       JMP     ·armPublicationBarrier(SB)
+       JMP     kernelPublicationBarrier<>(SB) // extra layer so this function is leaf and no SP adjustment on GOARM=7
 
 TEXT runtime·osyield(SB),NOSPLIT,$0
        MOVW    $SYS_sched_yield, R7