]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compiler,internal/runtime/atomic: optimize Load{64,32,8} on loong64
authorGuoqi Chen <chenguoqi@loongson.cn>
Tue, 23 Apr 2024 09:01:28 +0000 (17:01 +0800)
committerabner chenc <chenguoqi@loongson.cn>
Thu, 1 Aug 2024 02:17:13 +0000 (02:17 +0000)
The LoadAcquire barrier on Loong64 is "dbar 0x14", using the correct
barrier in Load{8,32,64} implementation can improve performance.

goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A6000-HV @ 2500.00MHz
                |  bench.old   | bench.new                           |
                |  sec/op      |  sec/op        vs base              |
AtomicLoad64      17.210n ± 0%   4.402n ± 0%   -74.42% (p=0.000 n=20)
AtomicLoad64-2    17.210n ± 0%   4.402n ± 0%   -74.42% (p=0.000 n=20)
AtomicLoad64-4    17.210n ± 0%   4.402n ± 0%   -74.42% (p=0.000 n=20)
AtomicLoad        17.220n ± 0%   4.402n ± 0%   -74.44% (p=0.000 n=20)
AtomicLoad-2      17.210n ± 0%   4.402n ± 0%   -74.42% (p=0.000 n=20)
AtomicLoad-4      17.210n ± 0%   4.402n ± 0%   -74.42% (p=0.000 n=20)
AtomicLoad8       17.210n ± 0%   4.402n ± 0%   -74.42% (p=0.000 n=20)
AtomicLoad8-2     17.210n ± 0%   4.402n ± 0%   -74.42% (p=0.000 n=20)
AtomicLoad8-4     17.210n ± 0%   4.402n ± 0%   -74.42% (p=0.000 n=20)
geomean           17.21n         4.402n        -74.42%

goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A5000 @ 2500.00MHz
                |  bench.old   | bench.new                           |
                |  sec/op      |  sec/op        vs base              |
AtomicLoad64      18.82n ± 0%    10.41n ± 0%   -44.69% (p=0.000 n=20)
AtomicLoad64-2    18.81n ± 0%    10.41n ± 0%   -44.66% (p=0.000 n=20)
AtomicLoad64-4    18.82n ± 0%    10.41n ± 0%   -44.69% (p=0.000 n=20)
AtomicLoad        18.81n ± 0%    10.41n ± 0%   -44.66% (p=0.000 n=20)
AtomicLoad-2      18.82n ± 0%    10.41n ± 0%   -44.69% (p=0.000 n=20)
AtomicLoad-4      18.81n ± 0%    10.42n ± 0%   -44.63% (p=0.000 n=20)
AtomicLoad8       18.82n ± 0%    10.41n ± 0%   -44.69% (p=0.000 n=20)
AtomicLoad8-2     18.82n ± 0%    10.41n ± 0%   -44.70% (p=0.000 n=20)
AtomicLoad8-4     18.82n ± 0%    10.41n ± 0%   -44.69% (p=0.000 n=20)
geomean           18.82n         10.41n        -44.68%

Change-Id: I9d47c9d6f359c4f2e41035ca656429aade2e7847
Reviewed-on: https://go-review.googlesource.com/c/go/+/581357
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/cmd/compile/internal/loong64/ssa.go
src/internal/runtime/atomic/atomic_loong64.s
src/internal/runtime/atomic/bench_test.go

index 1b1fdfdc71437294a8eba3dde9096983abd41b5b..7cdf5637f201426c53e7faa9c3f7a6df5fffec10 100644 (file)
@@ -468,6 +468,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                p.To.Sym = ssagen.BoundsCheckFunc[v.AuxInt]
                s.UseArgs(16) // space used in callee args area by assembly stubs
        case ssa.OpLOONG64LoweredAtomicLoad8, ssa.OpLOONG64LoweredAtomicLoad32, ssa.OpLOONG64LoweredAtomicLoad64:
+               // MOVB (Rarg0), Rout
+               // DBAR 0x14
                as := loong64.AMOVV
                switch v.Op {
                case ssa.OpLOONG64LoweredAtomicLoad8:
@@ -475,13 +477,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                case ssa.OpLOONG64LoweredAtomicLoad32:
                        as = loong64.AMOVW
                }
-               s.Prog(loong64.ADBAR)
                p := s.Prog(as)
                p.From.Type = obj.TYPE_MEM
                p.From.Reg = v.Args[0].Reg()
                p.To.Type = obj.TYPE_REG
                p.To.Reg = v.Reg0()
-               s.Prog(loong64.ADBAR)
+               p1 := s.Prog(loong64.ADBAR)
+               p1.From.Type = obj.TYPE_CONST
+               p1.From.Offset = 0x14
+
        case ssa.OpLOONG64LoweredAtomicStore8, ssa.OpLOONG64LoweredAtomicStore32, ssa.OpLOONG64LoweredAtomicStore64:
                as := loong64.AMOVV
                switch v.Op {
index 1812cb95fd4edd4a9185f4de3d3ecb4a6b55571c..9bed8654c85c17a4e133bd8fe41a378b292df930 100644 (file)
@@ -319,38 +319,30 @@ TEXT ·Oruintptr(SB), NOSPLIT, $0-24
 // uint32 internal∕runtime∕atomic·Load(uint32 volatile* ptr)
 TEXT ·Load(SB),NOSPLIT|NOFRAME,$0-12
        MOVV    ptr+0(FP), R19
-       DBAR
        MOVWU   0(R19), R19
-       DBAR
+       DBAR    $0x14   // LoadAcquire barrier
        MOVW    R19, ret+8(FP)
        RET
 
 // uint8 internal∕runtime∕atomic·Load8(uint8 volatile* ptr)
 TEXT ·Load8(SB),NOSPLIT|NOFRAME,$0-9
        MOVV    ptr+0(FP), R19
-       DBAR
        MOVBU   0(R19), R19
-       DBAR
+       DBAR    $0x14
        MOVB    R19, ret+8(FP)
        RET
 
 // uint64 internal∕runtime∕atomic·Load64(uint64 volatile* ptr)
 TEXT ·Load64(SB),NOSPLIT|NOFRAME,$0-16
        MOVV    ptr+0(FP), R19
-       DBAR
        MOVV    0(R19), R19
-       DBAR
+       DBAR    $0x14
        MOVV    R19, ret+8(FP)
        RET
 
 // void *internal∕runtime∕atomic·Loadp(void *volatile *ptr)
 TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-16
-       MOVV    ptr+0(FP), R19
-       DBAR
-       MOVV    0(R19), R19
-       DBAR
-       MOVV    R19, ret+8(FP)
-       RET
+       JMP     ·Load64(SB)
 
 // uint32 internal∕runtime∕atomic·LoadAcq(uint32 volatile* ptr)
 TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12
index 798431cf7258c2918e99ea5f67832ee355de46ad..6e3f14cbe463846a9bdd1f1d1df72480ae4f1c0c 100644 (file)
@@ -43,6 +43,14 @@ func BenchmarkAtomicStore(b *testing.B) {
        }
 }
 
+func BenchmarkAtomicLoad8(b *testing.B) {
+       var x uint8
+       sink = &x
+       for i := 0; i < b.N; i++ {
+               atomic.Load8(&x)
+       }
+}
+
 func BenchmarkAnd8(b *testing.B) {
        var x [512]uint8 // give byte its own cache line
        sink = &x