Cypherpunks repositories - gostls13.git/commitdiff
cmd/compiler,internal/runtime/atomic: optimize Cas{64,32} on loong64
author Guoqi Chen <chenguoqi@loongson.cn>
Fri, 20 Sep 2024 03:06:18 +0000 (11:06 +0800)
committer abner chenc <chenguoqi@loongson.cn>
Tue, 19 Nov 2024 01:15:07 +0000 (01:15 +0000)
Loongson's new microarchitecture LA664 (Loongson-3A6000) and later support the atomic
compare-and-exchange instruction AMCAS[DB]{B,W,H,V} [1]. Therefore, the implementation
of the atomic compare-and-swap operation can be selected at run time according to the
CPUCFG flag LAMCAS: the AMCASDB (full barrier) instruction is used on the new
microarchitectures, and the traditional LL-SC sequence is used on LA464 (Loongson-3A5000)
and older microarchitectures. This can significantly improve the performance of Go
programs on the new microarchitectures.

goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A6000 @ 2500.00MHz
         |  bench.old   |  bench.new                           |
         |   sec/op     |   sec/op       vs base               |
Cas        46.84n ±  0%   22.82n ±  0%  -51.28% (p=0.000 n=20)
Cas-2      47.58n ±  0%   29.57n ±  0%  -37.85% (p=0.000 n=20)
Cas-4      43.27n ± 20%   25.31n ± 13%  -41.50% (p=0.000 n=20)
Cas64      46.85n ±  0%   22.82n ±  0%  -51.29% (p=0.000 n=20)
Cas64-2    47.43n ±  0%   29.53n ±  0%  -37.74% (p=0.002 n=20)
Cas64-4    43.18n ±  0%   25.28n ±  2%  -41.46% (p=0.000 n=20)
geomean    45.82n         25.74n        -43.82%

goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A5000 @ 2500.00MHz
         |  bench.old  |  bench.new                         |
         |   sec/op    |   sec/op      vs base              |
Cas        50.05n ± 0%   51.26n ± 0%  +2.42% (p=0.000 n=20)
Cas-2      52.80n ± 0%   53.11n ± 0%  +0.59% (p=0.000 n=20)
Cas-4      55.97n ± 0%   57.31n ± 0%  +2.39% (p=0.000 n=20)
Cas64      50.05n ± 0%   51.26n ± 0%  +2.42% (p=0.000 n=20)
Cas64-2    52.68n ± 0%   53.11n ± 0%  +0.82% (p=0.000 n=20)
Cas64-4    55.96n ± 0%   57.26n ± 0%  +2.33% (p=0.000 n=20)
geomean    52.86n        53.83n       +1.82%

[1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html

Change-Id: I9b777c63c124fb492f61c903f77061fa2b4e5322
Reviewed-on: https://go-review.googlesource.com/c/go/+/613396
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

15 files changed:
src/cmd/compile/internal/ir/symtab.go
src/cmd/compile/internal/loong64/ssa.go
src/cmd/compile/internal/ssa/_gen/LOONG64.rules
src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteLOONG64.go
src/cmd/compile/internal/ssagen/intrinsics.go
src/cmd/compile/internal/ssagen/ssa.go
src/cmd/compile/internal/typecheck/_builtin/runtime.go
src/cmd/compile/internal/typecheck/builtin.go
src/cmd/internal/goobj/builtinlist.go
src/internal/runtime/atomic/atomic_loong64.go
src/internal/runtime/atomic/atomic_loong64.s
src/runtime/cpuflags.go
src/runtime/proc.go

index c977a6b94ed4f7b04c519c79a379737692823a6c..1cc8d93f1003142e525f784dd4e46e1b99921871 100644 (file)
@@ -60,6 +60,7 @@ type symsStruct struct {
        Zerobase         *obj.LSym
        ARM64HasATOMICS  *obj.LSym
        ARMHasVFPv4      *obj.LSym
+       Loong64HasLAMCAS *obj.LSym
        Loong64HasLAM_BH *obj.LSym
        Loong64HasLSX    *obj.LSym
        X86HasFMA        *obj.LSym
index a52a2c0eca1e045644d049a6b68f2301d1b3d0a2..f46ec74a28b252ac1e15853d6225a26a654f4900 100644 (file)
@@ -746,52 +746,64 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 
        case ssa.OpLOONG64LoweredAtomicCas32, ssa.OpLOONG64LoweredAtomicCas64:
                // MOVV $0, Rout
-               // DBAR
+               // DBAR 0x14
                // LL   (Rarg0), Rtmp
                // BNE  Rtmp, Rarg1, 4(PC)
                // MOVV Rarg2, Rout
                // SC   Rout, (Rarg0)
                // BEQ  Rout, -4(PC)
-               // DBAR
+               // DBAR 0x12
                ll := loong64.ALLV
                sc := loong64.ASCV
                if v.Op == ssa.OpLOONG64LoweredAtomicCas32 {
                        ll = loong64.ALL
                        sc = loong64.ASC
                }
+
                p := s.Prog(loong64.AMOVV)
                p.From.Type = obj.TYPE_REG
                p.From.Reg = loong64.REGZERO
                p.To.Type = obj.TYPE_REG
                p.To.Reg = v.Reg0()
-               s.Prog(loong64.ADBAR)
-               p1 := s.Prog(ll)
-               p1.From.Type = obj.TYPE_MEM
-               p1.From.Reg = v.Args[0].Reg()
-               p1.To.Type = obj.TYPE_REG
-               p1.To.Reg = loong64.REGTMP
-               p2 := s.Prog(loong64.ABNE)
-               p2.From.Type = obj.TYPE_REG
-               p2.From.Reg = v.Args[1].Reg()
-               p2.Reg = loong64.REGTMP
-               p2.To.Type = obj.TYPE_BRANCH
-               p3 := s.Prog(loong64.AMOVV)
+
+               p1 := s.Prog(loong64.ADBAR)
+               p1.From.Type = obj.TYPE_CONST
+               p1.From.Offset = 0x14
+
+               p2 := s.Prog(ll)
+               p2.From.Type = obj.TYPE_MEM
+               p2.From.Reg = v.Args[0].Reg()
+               p2.To.Type = obj.TYPE_REG
+               p2.To.Reg = loong64.REGTMP
+
+               p3 := s.Prog(loong64.ABNE)
                p3.From.Type = obj.TYPE_REG
-               p3.From.Reg = v.Args[2].Reg()
-               p3.To.Type = obj.TYPE_REG
-               p3.To.Reg = v.Reg0()
-               p4 := s.Prog(sc)
+               p3.From.Reg = v.Args[1].Reg()
+               p3.Reg = loong64.REGTMP
+               p3.To.Type = obj.TYPE_BRANCH
+
+               p4 := s.Prog(loong64.AMOVV)
                p4.From.Type = obj.TYPE_REG
-               p4.From.Reg = v.Reg0()
-               p4.To.Type = obj.TYPE_MEM
-               p4.To.Reg = v.Args[0].Reg()
-               p5 := s.Prog(loong64.ABEQ)
+               p4.From.Reg = v.Args[2].Reg()
+               p4.To.Type = obj.TYPE_REG
+               p4.To.Reg = v.Reg0()
+
+               p5 := s.Prog(sc)
                p5.From.Type = obj.TYPE_REG
                p5.From.Reg = v.Reg0()
-               p5.To.Type = obj.TYPE_BRANCH
-               p5.To.SetTarget(p1)
-               p6 := s.Prog(loong64.ADBAR)
-               p2.To.SetTarget(p6)
+               p5.To.Type = obj.TYPE_MEM
+               p5.To.Reg = v.Args[0].Reg()
+
+               p6 := s.Prog(loong64.ABEQ)
+               p6.From.Type = obj.TYPE_REG
+               p6.From.Reg = v.Reg0()
+               p6.To.Type = obj.TYPE_BRANCH
+               p6.To.SetTarget(p2)
+
+               p7 := s.Prog(loong64.ADBAR)
+               p7.From.Type = obj.TYPE_CONST
+               p7.From.Offset = 0x12
+               p3.To.SetTarget(p7)
 
        case ssa.OpLOONG64LoweredAtomicAnd32,
                ssa.OpLOONG64LoweredAtomicOr32:
@@ -815,6 +827,53 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                p.To.Reg = v.Args[0].Reg()
                p.RegTo2 = v.Reg0()
 
+       case ssa.OpLOONG64LoweredAtomicCas64Variant, ssa.OpLOONG64LoweredAtomicCas32Variant:
+               // MOVV         $0, Rout
+               // MOVV         Rarg1, Rtmp
+               // AMCASDBx     Rarg2, (Rarg0), Rtmp
+               // BNE          Rarg1, Rtmp, 2(PC)
+               // MOVV         $1, Rout
+               // NOP
+
+               amcasx := loong64.AAMCASDBV
+               if v.Op == ssa.OpLOONG64LoweredAtomicCas32Variant {
+                       amcasx = loong64.AAMCASDBW
+               }
+
+               p := s.Prog(loong64.AMOVV)
+               p.From.Type = obj.TYPE_REG
+               p.From.Reg = loong64.REGZERO
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = v.Reg0()
+
+               p1 := s.Prog(loong64.AMOVV)
+               p1.From.Type = obj.TYPE_REG
+               p1.From.Reg = v.Args[1].Reg()
+               p1.To.Type = obj.TYPE_REG
+               p1.To.Reg = loong64.REGTMP
+
+               p2 := s.Prog(amcasx)
+               p2.From.Type = obj.TYPE_REG
+               p2.From.Reg = v.Args[2].Reg()
+               p2.To.Type = obj.TYPE_MEM
+               p2.To.Reg = v.Args[0].Reg()
+               p2.RegTo2 = loong64.REGTMP
+
+               p3 := s.Prog(loong64.ABNE)
+               p3.From.Type = obj.TYPE_REG
+               p3.From.Reg = v.Args[1].Reg()
+               p3.Reg = loong64.REGTMP
+               p3.To.Type = obj.TYPE_BRANCH
+
+               p4 := s.Prog(loong64.AMOVV)
+               p4.From.Type = obj.TYPE_CONST
+               p4.From.Offset = 0x1
+               p4.To.Type = obj.TYPE_REG
+               p4.To.Reg = v.Reg0()
+
+               p5 := s.Prog(obj.ANOP)
+               p3.To.SetTarget(p5)
+
        case ssa.OpLOONG64LoweredNilCheck:
                // Issue a load which will fault if arg is nil.
                p := s.Prog(loong64.AMOVB)
index eba495f21dc47fc467d116c86c13de80c76ca98f..1f1434c4be9eb834725d680fddefb2340bf2d16c 100644 (file)
 
 (AtomicAdd(32|64) ...) => (LoweredAtomicAdd(32|64) ...)
 
+// Loong64's 32-bit atomic instructions ll.w and amcasw both sign-extend the loaded value
+// to 64 bits, so the old value must also be sign-extended to 64 bits; otherwise the
+// subsequent full-register comparison may not produce the expected result.
+//
 (AtomicCompareAndSwap32 ptr old new mem) => (LoweredAtomicCas32 ptr (SignExt32to64 old) new mem)
 (AtomicCompareAndSwap64 ...) => (LoweredAtomicCas64 ...)
+(AtomicCompareAndSwap32Variant ptr old new mem) => (LoweredAtomicCas32Variant ptr (SignExt32to64 old) new mem)
+(AtomicCompareAndSwap64Variant ...) => (LoweredAtomicCas64Variant ...)
 
 // Atomic memory logical operations (old style).
 //
index 270c262e8edc4f9324f763564ecff56c833e5efa..360458b96ae31a1eaa560e7d518cf2b3b048f223 100644 (file)
@@ -479,17 +479,34 @@ func init() {
                // } else {
                //   return (false, memory)
                // }
-               // DBAR
                // MOVV $0, Rout
+               // DBAR 0x14
                // LL   (Rarg0), Rtmp
                // BNE  Rtmp, Rarg1, 4(PC)
                // MOVV Rarg2, Rout
                // SC   Rout, (Rarg0)
                // BEQ  Rout, -4(PC)
-               // DBAR
+               // DBAR 0x12
                {name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
                {name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
 
+               // atomic compare and swap variant.
+               // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
+               // if *arg0 == arg1 {
+               //   *arg0 = arg2
+               //   return (true, memory)
+               // } else {
+               //   return (false, memory)
+               // }
+               // MOVV         $0, Rout
+               // MOVV         Rarg1, Rtmp
+               // AMCASDBx     Rarg2, (Rarg0), Rtmp
+               // BNE          Rarg1, Rtmp, 2(PC)
+               // MOVV         $1, Rout
+               // NOP
+               {name: "LoweredAtomicCas64Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+               {name: "LoweredAtomicCas32Variant", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+
                // Atomic 32 bit AND/OR.
                // *arg0 &= (|=) arg1. arg2=mem. returns nil.
                {name: "LoweredAtomicAnd32", argLength: 3, reg: gpxchg, asm: "AMANDDBW", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
index db4f17317d2ed76910654fafccf0650153a97a3e..f7179d98d575a3fe72904eaa59a6b28638166c3a 100644 (file)
@@ -1928,6 +1928,8 @@ const (
        OpLOONG64LoweredAtomicAdd64
        OpLOONG64LoweredAtomicCas32
        OpLOONG64LoweredAtomicCas64
+       OpLOONG64LoweredAtomicCas64Variant
+       OpLOONG64LoweredAtomicCas32Variant
        OpLOONG64LoweredAtomicAnd32
        OpLOONG64LoweredAtomicOr32
        OpLOONG64LoweredAtomicAnd32value
@@ -25921,6 +25923,42 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:            "LoweredAtomicCas64Variant",
+               argLen:          4,
+               resultNotInArgs: true,
+               faultOnNilArg0:  true,
+               hasSideEffects:  true,
+               unsafePoint:     true,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 1073741816},          // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31
+                               {2, 1073741816},          // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31
+                               {0, 4611686019501129724}, // SP R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 SB
+                       },
+                       outputs: []outputInfo{
+                               {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
+                       },
+               },
+       },
+       {
+               name:            "LoweredAtomicCas32Variant",
+               argLen:          4,
+               resultNotInArgs: true,
+               faultOnNilArg0:  true,
+               hasSideEffects:  true,
+               unsafePoint:     true,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 1073741816},          // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31
+                               {2, 1073741816},          // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31
+                               {0, 4611686019501129724}, // SP R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 SB
+                       },
+                       outputs: []outputInfo{
+                               {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
+                       },
+               },
+       },
        {
                name:            "LoweredAtomicAnd32",
                argLen:          3,
index fd0f938a432ce93ed8b62f9474dba7b48d5a3110..40265bd1249f14938f75f5115c8e3e9fa14c4c63 100644 (file)
@@ -66,9 +66,14 @@ func rewriteValueLOONG64(v *Value) bool {
                return rewriteValueLOONG64_OpAtomicAnd8(v)
        case OpAtomicCompareAndSwap32:
                return rewriteValueLOONG64_OpAtomicCompareAndSwap32(v)
+       case OpAtomicCompareAndSwap32Variant:
+               return rewriteValueLOONG64_OpAtomicCompareAndSwap32Variant(v)
        case OpAtomicCompareAndSwap64:
                v.Op = OpLOONG64LoweredAtomicCas64
                return true
+       case OpAtomicCompareAndSwap64Variant:
+               v.Op = OpLOONG64LoweredAtomicCas64Variant
+               return true
        case OpAtomicExchange32:
                v.Op = OpLOONG64LoweredAtomicExchange32
                return true
@@ -915,6 +920,27 @@ func rewriteValueLOONG64_OpAtomicCompareAndSwap32(v *Value) bool {
                return true
        }
 }
+func rewriteValueLOONG64_OpAtomicCompareAndSwap32Variant(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       typ := &b.Func.Config.Types
+       // match: (AtomicCompareAndSwap32Variant ptr old new mem)
+       // result: (LoweredAtomicCas32Variant ptr (SignExt32to64 old) new mem)
+       for {
+               ptr := v_0
+               old := v_1
+               new := v_2
+               mem := v_3
+               v.reset(OpLOONG64LoweredAtomicCas32Variant)
+               v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
+               v0.AddArg(old)
+               v.AddArg4(ptr, v0, new, mem)
+               return true
+       }
+}
 func rewriteValueLOONG64_OpAtomicOr8(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
index 6cf3eb9cfe033f3c331e541c531078719542ec28..f69d7bdc667574708c24ab4aa69a1e70662ffd65 100644 (file)
@@ -298,7 +298,7 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
                },
                sys.PPC64)
 
-       makeAtomicGuardedIntrinsicLoong64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
+       makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
                return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        // Target Atomic feature is identified by dynamic detection
                        addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
@@ -315,29 +315,21 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
 
                        // We have atomic instructions - use it directly.
                        s.startBlock(bTrue)
-                       emit(s, n, args, op1, typ, needReturn)
+                       emit(s, n, args, op1, typ, false)
                        s.endBlock().AddEdgeTo(bEnd)
 
                        // Use original instruction sequence.
                        s.startBlock(bFalse)
-                       emit(s, n, args, op0, typ, needReturn)
+                       emit(s, n, args, op0, typ, false)
                        s.endBlock().AddEdgeTo(bEnd)
 
                        // Merge results.
                        s.startBlock(bEnd)
 
-                       if needReturn {
-                               return s.variable(n, types.Types[typ])
-                       } else {
-                               return nil
-                       }
+                       return nil
                }
        }
 
-       makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
-               return makeAtomicGuardedIntrinsicLoong64common(op0, op1, typ, emit, false)
-       }
-
        atomicStoreEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
                v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
                s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
@@ -475,14 +467,14 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
                        s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
                        return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
                },
-               sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+               sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
        addF("internal/runtime/atomic", "Cas64",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
                        s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
                        return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
                },
-               sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+               sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
        addF("internal/runtime/atomic", "CasRel",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
@@ -506,6 +498,53 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
                makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
                sys.ARM64)
 
+       atomicCasEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
+               v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
+               s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+               if needReturn {
+                       s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
+               }
+       }
+
+       makeAtomicCasGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, emit atomicOpEmitter) intrinsicBuilder {
+               return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       // Target Atomic feature is identified by dynamic detection
+                       addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAMCAS, s.sb)
+                       v := s.load(types.Types[types.TBOOL], addr)
+                       b := s.endBlock()
+                       b.Kind = ssa.BlockIf
+                       b.SetControl(v)
+                       bTrue := s.f.NewBlock(ssa.BlockPlain)
+                       bFalse := s.f.NewBlock(ssa.BlockPlain)
+                       bEnd := s.f.NewBlock(ssa.BlockPlain)
+                       b.AddEdgeTo(bTrue)
+                       b.AddEdgeTo(bFalse)
+                       b.Likely = ssa.BranchLikely
+
+                       // We have atomic instructions - use it directly.
+                       s.startBlock(bTrue)
+                       emit(s, n, args, op1, types.TBOOL, true)
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Use original instruction sequence.
+                       s.startBlock(bFalse)
+                       emit(s, n, args, op0, types.TBOOL, true)
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Merge results.
+                       s.startBlock(bEnd)
+
+                       return s.variable(n, types.Types[types.TBOOL])
+               }
+       }
+
+       addF("internal/runtime/atomic", "Cas",
+               makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, atomicCasEmitterLoong64),
+               sys.Loong64)
+       addF("internal/runtime/atomic", "Cas64",
+               makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, atomicCasEmitterLoong64),
+               sys.Loong64)
+
        // Old-style atomic logical operation API (all supported archs except arm64).
        addF("internal/runtime/atomic", "And8",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
index 0f92ccf1b57c2eabda3d03c76993702eb9d3dfe1..dc9b508c01ed12c69970eb9797f5f55c4e3fc47f 100644 (file)
@@ -150,6 +150,7 @@ func InitConfig() {
        ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA")               // bool
        ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4")           // bool
        ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS")   // bool
+       ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool
        ir.Syms.Loong64HasLAM_BH = typecheck.LookupRuntimeVar("loong64HasLAM_BH") // bool
        ir.Syms.Loong64HasLSX = typecheck.LookupRuntimeVar("loong64HasLSX")       // bool
        ir.Syms.Staticuint64s = typecheck.LookupRuntimeVar("staticuint64s")
index 464fe1becbd809c16982e8841caf48e61382c479..9a83911487830a96890561b877b9b886ccc793af 100644 (file)
@@ -289,6 +289,7 @@ var x86HasSSE41 bool
 var x86HasFMA bool
 var armHasVFPv4 bool
 var arm64HasATOMICS bool
+var loong64HasLAMCAS bool
 var loong64HasLAM_BH bool
 var loong64HasLSX bool
 
index c8fc913f9b899a4b513ef0f9770acb35bc775087..6860d78b2e4024b2b332343a657c5122f1b3fc57 100644 (file)
@@ -237,6 +237,7 @@ var runtimeDecls = [...]struct {
        {"x86HasFMA", varTag, 6},
        {"armHasVFPv4", varTag, 6},
        {"arm64HasATOMICS", varTag, 6},
+       {"loong64HasLAMCAS", varTag, 6},
        {"loong64HasLAM_BH", varTag, 6},
        {"loong64HasLSX", varTag, 6},
        {"asanregisterglobals", funcTag, 130},
index e9b8d6aade9d30a6c337fbb9e62eee17dac5059a..c133c60427f59855c281e658d53dfaebb98f61ae 100644 (file)
@@ -216,6 +216,7 @@ var builtins = [...]struct {
        {"runtime.x86HasFMA", 0},
        {"runtime.armHasVFPv4", 0},
        {"runtime.arm64HasATOMICS", 0},
+       {"runtime.loong64HasLAMCAS", 0},
        {"runtime.loong64HasLAM_BH", 0},
        {"runtime.loong64HasLSX", 0},
        {"runtime.asanregisterglobals", 1},
index a36262832348124425c7d1b82214db5c660a3ff2..6586ad2f6ce6a12c610a28226a37475436fe62e3 100644 (file)
@@ -12,6 +12,7 @@ import (
 )
 
 const (
+       offsetLOONG64HasLAMCAS = unsafe.Offsetof(cpu.Loong64.HasLAMCAS)
        offsetLoong64HasLAM_BH = unsafe.Offsetof(cpu.Loong64.HasLAM_BH)
 )
 
index 60741a23c25d6dbb0f70339174c046efb1445c85..d67300afc4f70d8cdc987120df4f9767de06788d 100644 (file)
@@ -16,18 +16,32 @@ TEXT ·Cas(SB), NOSPLIT, $0-17
        MOVV    ptr+0(FP), R4
        MOVW    old+8(FP), R5
        MOVW    new+12(FP), R6
-       DBAR
+
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLAMCAS(SB), R8
+       BEQ     R8, cas_again
+       MOVV    R5, R7  // backup old value
+       AMCASDBW        R6, (R4), R5
+       BNE     R7, R5, cas_fail0
+       MOVV    $1, R4
+       MOVB    R4, ret+16(FP)
+       RET
+cas_fail0:
+       MOVB    R0, ret+16(FP)
+       RET
+
+       // Implemented using the ll-sc instruction pair
+       DBAR    $0x14   // LoadAcquire barrier
 cas_again:
        MOVV    R6, R7
        LL      (R4), R8
-       BNE     R5, R8, cas_fail
+       BNE     R5, R8, cas_fail1
        SC      R7, (R4)
        BEQ     R7, cas_again
        MOVV    $1, R4
        MOVB    R4, ret+16(FP)
-       DBAR
+       DBAR    $0x12   // StoreRelease barrier
        RET
-cas_fail:
+cas_fail1:
        MOVV    $0, R4
        JMP     -4(PC)
 
@@ -43,21 +57,41 @@ TEXT ·Cas64(SB), NOSPLIT, $0-25
        MOVV    ptr+0(FP), R4
        MOVV    old+8(FP), R5
        MOVV    new+16(FP), R6
-       DBAR
+
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLAMCAS(SB), R8
+       BEQ     R8, cas64_again
+       MOVV    R5, R7  // backup old value
+       AMCASDBV        R6, (R4), R5
+       BNE     R7, R5, cas64_fail0
+       MOVV    $1, R4
+       MOVB    R4, ret+24(FP)
+       RET
+cas64_fail0:
+       MOVB    R0, ret+24(FP)
+       RET
+
+       // Implemented using the ll-sc instruction pair
+       DBAR    $0x14
 cas64_again:
        MOVV    R6, R7
        LLV     (R4), R8
-       BNE     R5, R8, cas64_fail
+       BNE     R5, R8, cas64_fail1
        SCV     R7, (R4)
        BEQ     R7, cas64_again
        MOVV    $1, R4
        MOVB    R4, ret+24(FP)
-       DBAR
+       DBAR    $0x12
        RET
-cas64_fail:
+cas64_fail1:
        MOVV    $0, R4
        JMP     -4(PC)
 
+TEXT ·Casint32(SB),NOSPLIT,$0-17
+       JMP     ·Cas(SB)
+
+TEXT ·Casint64(SB),NOSPLIT,$0-25
+       JMP     ·Cas64(SB)
+
 TEXT ·Casuintptr(SB), NOSPLIT, $0-25
        JMP     ·Cas64(SB)
 
index 3f88d20fb3590602d92ecfda980d044083df8137..e81e50f5dfcb483eaa3a95b4a1b734a0a3ac1930 100644 (file)
@@ -34,6 +34,7 @@ var (
 
        arm64HasATOMICS bool
 
+       loong64HasLAMCAS bool
        loong64HasLAM_BH bool
        loong64HasLSX    bool
 )
index cbfac3a92372d30e2634025e42d8b0dd52e7644c..3f360ef1291916703d0c5312e9a1a5fb5abf996d 100644 (file)
@@ -752,6 +752,7 @@ func cpuinit(env string) {
                arm64HasATOMICS = cpu.ARM64.HasATOMICS
 
        case "loong64":
+               loong64HasLAMCAS = cpu.Loong64.HasLAMCAS
                loong64HasLAM_BH = cpu.Loong64.HasLAM_BH
                loong64HasLSX = cpu.Loong64.HasLSX
        }