From 3214129a835d7ab5b857c77b8389a9aba4f9d5df Mon Sep 17 00:00:00 2001 From: Guoqi Chen Date: Tue, 23 Apr 2024 17:01:28 +0800 Subject: [PATCH] cmd/compiler,internal/runtime/atomic: optimize Load{64,32,8} on loong64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The LoadAcquire barrier on Loong64 is "dbar 0x14", using the correct barrier in Load{8,32,64} implementation can improve performance. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicLoad64 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad64-2 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad64-4 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad 17.220n ± 0% 4.402n ± 0% -74.44% (p=0.000 n=20) AtomicLoad-2 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad-4 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad8 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad8-2 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad8-4 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) geomean 17.21n 4.402n -74.42% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicLoad64 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20) AtomicLoad64-2 18.81n ± 0% 10.41n ± 0% -44.66% (p=0.000 n=20) AtomicLoad64-4 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20) AtomicLoad 18.81n ± 0% 10.41n ± 0% -44.66% (p=0.000 n=20) AtomicLoad-2 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20) AtomicLoad-4 18.81n ± 0% 10.42n ± 0% -44.63% (p=0.000 n=20) AtomicLoad8 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20) AtomicLoad8-2 18.82n ± 0% 10.41n ± 0% -44.70% (p=0.000 n=20) AtomicLoad8-4 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20) geomean 18.82n 10.41n -44.68% Change-Id: I9d47c9d6f359c4f2e41035ca656429aade2e7847 Reviewed-on: https://go-review.googlesource.com/c/go/+/581357 Reviewed-by: Cherry Mui Reviewed-by: Michael Knyszek LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/loong64/ssa.go | 8 ++++++-- src/internal/runtime/atomic/atomic_loong64.s | 16 ++++------------ src/internal/runtime/atomic/bench_test.go | 8 ++++++++ 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/cmd/compile/internal/loong64/ssa.go b/src/cmd/compile/internal/loong64/ssa.go index 1b1fdfdc71..7cdf5637f2 100644 --- a/src/cmd/compile/internal/loong64/ssa.go +++ b/src/cmd/compile/internal/loong64/ssa.go @@ -468,6 +468,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p.To.Sym = ssagen.BoundsCheckFunc[v.AuxInt] s.UseArgs(16) // space used in callee args area by assembly stubs case ssa.OpLOONG64LoweredAtomicLoad8, ssa.OpLOONG64LoweredAtomicLoad32, ssa.OpLOONG64LoweredAtomicLoad64: + // MOVB (Rarg0), Rout + // DBAR 0x14 as := loong64.AMOVV switch v.Op { case ssa.OpLOONG64LoweredAtomicLoad8: @@ -475,13 +477,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { case ssa.OpLOONG64LoweredAtomicLoad32: as = loong64.AMOVW } - s.Prog(loong64.ADBAR) p := s.Prog(as) p.From.Type = obj.TYPE_MEM p.From.Reg = v.Args[0].Reg() p.To.Type = obj.TYPE_REG p.To.Reg = v.Reg0() - s.Prog(loong64.ADBAR) + p1 := s.Prog(loong64.ADBAR) + p1.From.Type = obj.TYPE_CONST + p1.From.Offset = 0x14 + case ssa.OpLOONG64LoweredAtomicStore8, ssa.OpLOONG64LoweredAtomicStore32, ssa.OpLOONG64LoweredAtomicStore64: as := loong64.AMOVV switch v.Op { diff --git a/src/internal/runtime/atomic/atomic_loong64.s b/src/internal/runtime/atomic/atomic_loong64.s index 1812cb95fd..9bed8654c8 100644 --- a/src/internal/runtime/atomic/atomic_loong64.s +++ b/src/internal/runtime/atomic/atomic_loong64.s @@ -319,38 +319,30 @@ TEXT ·Oruintptr(SB), NOSPLIT, $0-24 // uint32 internal∕runtime∕atomic·Load(uint32 volatile* ptr) TEXT ·Load(SB),NOSPLIT|NOFRAME,$0-12 MOVV ptr+0(FP), R19 - DBAR MOVWU 0(R19), R19 - DBAR + DBAR $0x14 // LoadAcquire barrier MOVW R19, ret+8(FP) RET // uint8 internal∕runtime∕atomic·Load8(uint8 volatile* ptr) TEXT ·Load8(SB),NOSPLIT|NOFRAME,$0-9 MOVV ptr+0(FP), R19 - DBAR MOVBU 0(R19), R19 - DBAR + DBAR $0x14 MOVB R19, ret+8(FP) RET // uint64 internal∕runtime∕atomic·Load64(uint64 volatile* ptr) TEXT ·Load64(SB),NOSPLIT|NOFRAME,$0-16 MOVV ptr+0(FP), R19 - DBAR MOVV 0(R19), R19 - DBAR + DBAR $0x14 MOVV R19, ret+8(FP) RET // void *internal∕runtime∕atomic·Loadp(void *volatile *ptr) TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-16 - MOVV ptr+0(FP), R19 - DBAR - MOVV 0(R19), R19 - DBAR - MOVV R19, ret+8(FP) - RET + JMP ·Load64(SB) // uint32 internal∕runtime∕atomic·LoadAcq(uint32 volatile* ptr) TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12 diff --git a/src/internal/runtime/atomic/bench_test.go b/src/internal/runtime/atomic/bench_test.go index 798431cf72..6e3f14cbe4 100644 --- a/src/internal/runtime/atomic/bench_test.go +++ b/src/internal/runtime/atomic/bench_test.go @@ -43,6 +43,14 @@ func BenchmarkAtomicStore(b *testing.B) { } } +func BenchmarkAtomicLoad8(b *testing.B) { + var x uint8 + sink = &x + for i := 0; i < b.N; i++ { + atomic.Load8(&x) + } +} + func BenchmarkAnd8(b *testing.B) { var x [512]uint8 // give byte its own cache line sink = &x -- 2.48.1