]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile, internal/runtime/atomic: add Xchg8 for loong64
authorGuoqi Chen <chenguoqi@loongson.cn>
Wed, 30 Oct 2024 11:11:49 +0000 (19:11 +0800)
committerabner chenc <chenguoqi@loongson.cn>
Wed, 20 Nov 2024 02:58:50 +0000 (02:58 +0000)
In Loongson's new microstructure LA664 (Loongson-3A6000) and later, the atomic
instruction AMSWAP[DB]{B,H} [1] is supported. Therefore, the implementation of
the atomic operation exchange can be selected according to the CPUCFG flag LAM_BH:
AMSWAPDBB(full barrier) instruction is used on new microstructures, and traditional
LL-SC is used on LA464 (Loongson-3A5000) and older microstructures. This can
significantly improve the performance of Go programs on new microstructures.

Because Xchg8 implemented using traditional LL-SC uses too many temporary
registers, it is not suitable for intrinsics.

goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A6000 @ 2500.00MHz
BenchmarkXchg8              100000000         10.41 ns/op
BenchmarkXchg8-2            100000000         10.41 ns/op
BenchmarkXchg8-4            100000000         10.41 ns/op
BenchmarkXchg8Parallel      96647592         12.41 ns/op
BenchmarkXchg8Parallel-2    58376136         20.60 ns/op
BenchmarkXchg8Parallel-4    78458899         17.97 ns/op

goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A5000-HV @ 2500.00MHz
BenchmarkXchg8              38323825         31.23 ns/op
BenchmarkXchg8-2            38368219         31.23 ns/op
BenchmarkXchg8-4            37154156         31.26 ns/op
BenchmarkXchg8Parallel      37908301         31.63 ns/op
BenchmarkXchg8Parallel-2    30413440         39.42 ns/op
BenchmarkXchg8Parallel-4    30737626         39.03 ns/op

For #69735

[1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html

Change-Id: I02ba68f66a2210b6902344fdc9975eb62de728ab
Reviewed-on: https://go-review.googlesource.com/c/go/+/623058
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Mauri de Souza Meneguzzo <mauri870@gmail.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
src/cmd/compile/internal/loong64/ssa.go
src/cmd/compile/internal/ssa/_gen/LOONG64.rules
src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteLOONG64.go
src/cmd/compile/internal/ssagen/intrinsics.go
src/cmd/compile/internal/ssagen/intrinsics_test.go
src/internal/runtime/atomic/atomic_loong64.go
src/internal/runtime/atomic/atomic_loong64.s
src/internal/runtime/atomic/xchg8_test.go

index f46ec74a28b252ac1e15853d6225a26a654f4900..0ba9efa1d364274170233e0a246b2257e66e91fc 100644 (file)
@@ -722,6 +722,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                p.To.Reg = v.Args[0].Reg()
                p.RegTo2 = v.Reg0()
 
+       case ssa.OpLOONG64LoweredAtomicExchange8Variant:
+               // AMSWAPDBB    Rarg1, (Rarg0), Rout
+               p := s.Prog(loong64.AAMSWAPDBB)
+               p.From.Type = obj.TYPE_REG
+               p.From.Reg = v.Args[1].Reg()
+               p.To.Type = obj.TYPE_MEM
+               p.To.Reg = v.Args[0].Reg()
+               p.RegTo2 = v.Reg0()
+
        case ssa.OpLOONG64LoweredAtomicAdd32, ssa.OpLOONG64LoweredAtomicAdd64:
                // AMADDx  Rarg1, (Rarg0), Rout
                // ADDV    Rarg1, Rout, Rout
index 1f1434c4be9eb834725d680fddefb2340bf2d16c..00a0a84f33425362c4d8a27e037ed4b034f974af 100644 (file)
 (AtomicStorePtrNoWB ...) => (LoweredAtomicStore64 ...)
 
 (AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
+(AtomicExchange8Variant  ...) => (LoweredAtomicExchange8Variant  ...)
 
 (AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
 
index 360458b96ae31a1eaa560e7d518cf2b3b048f223..8f17158b6434056ba9fedcb1788faf4d7b8dc5f9 100644 (file)
@@ -466,6 +466,11 @@ func init() {
                {name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
                {name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
 
+               // atomic exchange variant.
+               // store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. auxint must be zero.
+               // AMSWAPDBB   Rarg1, (Rarg0), Rout
+               {name: "LoweredAtomicExchange8Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+
                // atomic add.
                // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>.
                {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
index f7179d98d575a3fe72904eaa59a6b28638166c3a..86d8924943093a836bd7c9559d501a1419516da2 100644 (file)
@@ -1924,6 +1924,7 @@ const (
        OpLOONG64LoweredAtomicStore64Variant
        OpLOONG64LoweredAtomicExchange32
        OpLOONG64LoweredAtomicExchange64
+       OpLOONG64LoweredAtomicExchange8Variant
        OpLOONG64LoweredAtomicAdd32
        OpLOONG64LoweredAtomicAdd64
        OpLOONG64LoweredAtomicCas32
@@ -25855,6 +25856,22 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:            "LoweredAtomicExchange8Variant",
+               argLen:          3,
+               resultNotInArgs: true,
+               faultOnNilArg0:  true,
+               hasSideEffects:  true,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 1073741816},          // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31
+                               {0, 4611686019501129724}, // SP R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 SB
+                       },
+                       outputs: []outputInfo{
+                               {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
+                       },
+               },
+       },
        {
                name:            "LoweredAtomicAdd32",
                argLen:          3,
index 40265bd1249f14938f75f5115c8e3e9fa14c4c63..ab39040de196e00b402990e824a17510adb81860 100644 (file)
@@ -80,6 +80,9 @@ func rewriteValueLOONG64(v *Value) bool {
        case OpAtomicExchange64:
                v.Op = OpLOONG64LoweredAtomicExchange64
                return true
+       case OpAtomicExchange8Variant:
+               v.Op = OpLOONG64LoweredAtomicExchange8Variant
+               return true
        case OpAtomicLoad32:
                v.Op = OpLOONG64LoweredAtomicLoad32
                return true
index f69d7bdc667574708c24ab4aa69a1e70662ffd65..8a721b41348b81c7fb7fc9df024459149ab09144 100644 (file)
@@ -439,6 +439,41 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
                makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
                sys.ARM64)
 
+       makeAtomicXchg8GuardedIntrinsicLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+               return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
+                       v := s.load(types.Types[types.TBOOL], addr)
+                       b := s.endBlock()
+                       b.Kind = ssa.BlockIf
+                       b.SetControl(v)
+                       bTrue := s.f.NewBlock(ssa.BlockPlain)
+                       bFalse := s.f.NewBlock(ssa.BlockPlain)
+                       bEnd := s.f.NewBlock(ssa.BlockPlain)
+                       b.AddEdgeTo(bTrue)
+                       b.AddEdgeTo(bFalse)
+                       b.Likely = ssa.BranchLikely // most loong64 machines support the amswapdb.b
+
+                       // We have the intrinsic - use it directly.
+                       s.startBlock(bTrue)
+                       s.vars[n] = s.newValue3(op, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
+                       s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, s.vars[n])
+                       s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], s.vars[n])
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Call the pure Go version.
+                       s.startBlock(bFalse)
+                       s.vars[n] = s.callResult(n, callNormal) // types.Types[TUINT8]
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Merge results.
+                       s.startBlock(bEnd)
+                       return s.variable(n, types.Types[types.TUINT8])
+               }
+       }
+       addF("internal/runtime/atomic", "Xchg8",
+               makeAtomicXchg8GuardedIntrinsicLoong64(ssa.OpAtomicExchange8Variant),
+               sys.Loong64)
+
        addF("internal/runtime/atomic", "Xadd",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
index 7603327b2f5b20fc6928ffb7e8ace9bd80bf98e2..df2a6b3187282b3a3760ea58bdc6012a52c8cc2a 100644 (file)
@@ -392,6 +392,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
        {"loong64", "internal/runtime/atomic", "Xaddint32"}:        struct{}{},
        {"loong64", "internal/runtime/atomic", "Xaddint64"}:        struct{}{},
        {"loong64", "internal/runtime/atomic", "Xadduintptr"}:      struct{}{},
+       {"loong64", "internal/runtime/atomic", "Xchg8"}:            struct{}{},
        {"loong64", "internal/runtime/atomic", "Xchg"}:             struct{}{},
        {"loong64", "internal/runtime/atomic", "Xchg64"}:           struct{}{},
        {"loong64", "internal/runtime/atomic", "Xchgint32"}:        struct{}{},
index 6586ad2f6ce6a12c610a28226a37475436fe62e3..1fa1a9fa5ab4a114163b27b69ab0b0d2684a3e6f 100644 (file)
@@ -25,6 +25,9 @@ func Xadd64(ptr *uint64, delta int64) uint64
 //go:noescape
 func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
 
+//go:noescape
+func Xchg8(ptr *uint8, new uint8) uint8
+
 //go:noescape
 func Xchg(ptr *uint32, new uint32) uint32
 
index d67300afc4f70d8cdc987120df4f9767de06788d..5222b77e77fa7701d633a376e2839960f70c1213 100644 (file)
@@ -150,6 +150,44 @@ TEXT ·Xadd64(SB), NOSPLIT, $0-24
        MOVV    R4, ret+16(FP)
        RET
 
+// uint8 Xchg8(ptr *uint8, new uint8)
+// Atomically:
+//     old := *ptr;
+//     *ptr = new;
+//     return old;
+TEXT ·Xchg8(SB), NOSPLIT, $0-17
+       MOVV    ptr+0(FP), R4
+       MOVBU   new+8(FP), R5
+
+       // R6 = ((ptr & 3) * 8)
+       AND     $3, R4, R6
+       SLLV    $3, R6
+
+       // R7 = ((0xFF) << R6) ^ (-1)
+       MOVV    $0xFF, R8
+       SLLV    R6, R8, R7
+       XOR     $-1, R7
+
+       // R4 = ptr & (~3)
+       MOVV    $~3, R8
+       AND     R8, R4
+
+       // R5 = ((val) << R6)
+       SLLV    R6, R5
+
+       DBAR    $0x14   // LoadAcquire barrier
+_xchg8_again:
+       LL      (R4), R8
+       MOVV    R8, R9  // backup old val
+       AND     R7, R8
+       OR      R5, R8
+       SC      R8, (R4)
+       BEQ     R8, _xchg8_again
+       DBAR    $0x12   // StoreRelease barrier
+       SRLV    R6, R9, R9
+       MOVBU   R9, ret+16(FP)
+       RET
+
 // func Xchg(ptr *uint32, new uint32) uint32
 TEXT ·Xchg(SB), NOSPLIT, $0-20
        MOVV    ptr+0(FP), R4
index d9c0a8dd24362705a3fbf36c2b50d4acb1816c17..016ce819b0f42e4cbdce1cd6d6961b40c070ca25 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build 386 || amd64 || arm || arm64 || ppc64 || ppc64le
+//go:build 386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le
 
 package atomic_test