Cypherpunks repositories - gostls13.git/commitdiff
cmd/compiler,internal/runtime/atomic: optimize xadd{32,64} on loong64
author Guoqi Chen <chenguoqi@loongson.cn>
Mon, 3 Apr 2023 04:11:46 +0000 (12:11 +0800)
committer abner chenc <chenguoqi@loongson.cn>
Fri, 8 Nov 2024 01:04:28 +0000 (01:04 +0000)
Use Loong64's atomic operation instructions AMADDDB{W,V} (full barrier)
to implement atomic.Xadd{32,64}.

goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A5000 @ 2500.00MHz
          |  bench.old    |  bench.new                            |
          |  sec/op       |  sec/op          vs base              |
Xadd         27.24n ± 0%     12.01n ± 0%    -55.91% (p=0.000 n=20)
Xadd-2       31.93n ± 0%     25.55n ± 0%    -19.98% (p=0.000 n=20)
Xadd-4       31.90n ± 0%     24.80n ± 0%    -22.26% (p=0.000 n=20)
Xadd64       27.23n ± 0%     12.01n ± 0%    -55.89% (p=0.000 n=20)
Xadd64-2     31.93n ± 0%     25.57n ± 0%    -19.90% (p=0.000 n=20)
Xadd64-4     31.89n ± 0%     24.80n ± 0%    -22.23% (p=0.000 n=20)
geomean      30.27n          19.67n         -35.01%

goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A6000 @ 2500.00MHz
          |  bench.old    |  bench.new                           |
          |  sec/op       |  sec/op         vs base              |
Xadd         26.02n ± 0%     12.41n ± 0%   -52.31% (p=0.000 n=20)
Xadd-2       37.36n ± 0%     20.60n ± 0%   -44.86% (p=0.000 n=20)
Xadd-4       37.22n ± 0%     19.59n ± 0%   -47.37% (p=0.000 n=20)
Xadd64       26.42n ± 0%     12.41n ± 0%   -53.03% (p=0.000 n=20)
Xadd64-2     37.77n ± 0%     20.60n ± 0%   -45.46% (p=0.000 n=20)
Xadd64-4     37.78n ± 0%     19.59n ± 0%   -48.15% (p=0.000 n=20)
geomean      33.30n          17.11n        -48.62%

Change-Id: I982539c2aa04680e9dd11b099ba8d5f215bf9b32
Reviewed-on: https://go-review.googlesource.com/c/go/+/481937
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn>
src/cmd/compile/internal/loong64/ssa.go
src/cmd/compile/internal/ssa/_gen/LOONG64.rules
src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteLOONG64.go
src/internal/runtime/atomic/atomic_loong64.s

index 7cdaa30ffe7d39d69526f606ecc6641d1a2853ff..bec76843786fcf386eca547d038db1f1e791901d 100644 (file)
@@ -694,92 +694,29 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                p3.To.Type = obj.TYPE_BRANCH
                p3.To.SetTarget(p)
                s.Prog(loong64.ADBAR)
+
        case ssa.OpLOONG64LoweredAtomicAdd32, ssa.OpLOONG64LoweredAtomicAdd64:
-               // DBAR
-               // LL   (Rarg0), Rout
-               // ADDV Rarg1, Rout, Rtmp
-               // SC   Rtmp, (Rarg0)
-               // BEQ  Rtmp, -3(PC)
-               // DBAR
-               // ADDV Rarg1, Rout
-               ll := loong64.ALLV
-               sc := loong64.ASCV
+               // AMADDx  Rarg1, (Rarg0), Rout
+               // ADDV    Rarg1, Rout, Rout
+               amaddx := loong64.AAMADDDBV
+               addx := loong64.AADDV
                if v.Op == ssa.OpLOONG64LoweredAtomicAdd32 {
-                       ll = loong64.ALL
-                       sc = loong64.ASC
+                       amaddx = loong64.AAMADDDBW
                }
-               s.Prog(loong64.ADBAR)
-               p := s.Prog(ll)
-               p.From.Type = obj.TYPE_MEM
-               p.From.Reg = v.Args[0].Reg()
-               p.To.Type = obj.TYPE_REG
-               p.To.Reg = v.Reg0()
-               p1 := s.Prog(loong64.AADDVU)
+               p := s.Prog(amaddx)
+               p.From.Type = obj.TYPE_REG
+               p.From.Reg = v.Args[1].Reg()
+               p.To.Type = obj.TYPE_MEM
+               p.To.Reg = v.Args[0].Reg()
+               p.RegTo2 = v.Reg0()
+
+               p1 := s.Prog(addx)
                p1.From.Type = obj.TYPE_REG
                p1.From.Reg = v.Args[1].Reg()
                p1.Reg = v.Reg0()
                p1.To.Type = obj.TYPE_REG
-               p1.To.Reg = loong64.REGTMP
-               p2 := s.Prog(sc)
-               p2.From.Type = obj.TYPE_REG
-               p2.From.Reg = loong64.REGTMP
-               p2.To.Type = obj.TYPE_MEM
-               p2.To.Reg = v.Args[0].Reg()
-               p3 := s.Prog(loong64.ABEQ)
-               p3.From.Type = obj.TYPE_REG
-               p3.From.Reg = loong64.REGTMP
-               p3.To.Type = obj.TYPE_BRANCH
-               p3.To.SetTarget(p)
-               s.Prog(loong64.ADBAR)
-               p4 := s.Prog(loong64.AADDVU)
-               p4.From.Type = obj.TYPE_REG
-               p4.From.Reg = v.Args[1].Reg()
-               p4.Reg = v.Reg0()
-               p4.To.Type = obj.TYPE_REG
-               p4.To.Reg = v.Reg0()
-       case ssa.OpLOONG64LoweredAtomicAddconst32, ssa.OpLOONG64LoweredAtomicAddconst64:
-               // DBAR
-               // LL   (Rarg0), Rout
-               // ADDV $auxint, Rout, Rtmp
-               // SC   Rtmp, (Rarg0)
-               // BEQ  Rtmp, -3(PC)
-               // DBAR
-               // ADDV $auxint, Rout
-               ll := loong64.ALLV
-               sc := loong64.ASCV
-               if v.Op == ssa.OpLOONG64LoweredAtomicAddconst32 {
-                       ll = loong64.ALL
-                       sc = loong64.ASC
-               }
-               s.Prog(loong64.ADBAR)
-               p := s.Prog(ll)
-               p.From.Type = obj.TYPE_MEM
-               p.From.Reg = v.Args[0].Reg()
-               p.To.Type = obj.TYPE_REG
-               p.To.Reg = v.Reg0()
-               p1 := s.Prog(loong64.AADDVU)
-               p1.From.Type = obj.TYPE_CONST
-               p1.From.Offset = v.AuxInt
-               p1.Reg = v.Reg0()
-               p1.To.Type = obj.TYPE_REG
-               p1.To.Reg = loong64.REGTMP
-               p2 := s.Prog(sc)
-               p2.From.Type = obj.TYPE_REG
-               p2.From.Reg = loong64.REGTMP
-               p2.To.Type = obj.TYPE_MEM
-               p2.To.Reg = v.Args[0].Reg()
-               p3 := s.Prog(loong64.ABEQ)
-               p3.From.Type = obj.TYPE_REG
-               p3.From.Reg = loong64.REGTMP
-               p3.To.Type = obj.TYPE_BRANCH
-               p3.To.SetTarget(p)
-               s.Prog(loong64.ADBAR)
-               p4 := s.Prog(loong64.AADDVU)
-               p4.From.Type = obj.TYPE_CONST
-               p4.From.Offset = v.AuxInt
-               p4.Reg = v.Reg0()
-               p4.To.Type = obj.TYPE_REG
-               p4.To.Reg = v.Reg0()
+               p1.To.Reg = v.Reg0()
+
        case ssa.OpLOONG64LoweredAtomicCas32, ssa.OpLOONG64LoweredAtomicCas64:
                // MOVV $0, Rout
                // DBAR
index e351c2d402e306600bc500f4a1c2703d79a37f06..383cac40aba73421aa83ef6bb06173aed2fb54ac 100644 (file)
        && is32Bit(int64(off1)+int64(off2)) && (ptr.Op != OpSB || !config.ctxt.Flag_dynlink) =>
        (MOV(B|H|W|V)storezero [off1+int32(off2)] {mergeSym(sym1,sym2)} ptr mem)
 
-(LoweredAtomicAdd32 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst32 [int32(c)] ptr mem)
-(LoweredAtomicAdd64 ptr (MOVVconst [c]) mem) && is32Bit(c) => (LoweredAtomicAddconst64 [c] ptr mem)
-
 // don't extend after proper load
 (MOVBreg x:(MOVBload _ _)) => (MOVVreg x)
 (MOVBUreg x:(MOVBUload _ _)) => (MOVVreg x)
index a460882dca8a17ac483ea8ec44f2c0842c8e9313..2d8d87fa4ae76b7e983a73885e16c8b48aa2e371 100644 (file)
@@ -448,18 +448,8 @@ func init() {
 
                // atomic add.
                // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>.
-               // DBAR
-               // LL   (Rarg0), Rout
-               // ADDV Rarg1, Rout, Rtmp
-               // SC   Rtmp, (Rarg0)
-               // BEQ  Rtmp, -3(PC)
-               // DBAR
-               // ADDV Rarg1, Rout
-               {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
-               {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
-               // *arg0 += auxint. arg1=mem. returns <new content of *arg0, memory>. auxint is 32-bit.
-               {name: "LoweredAtomicAddconst32", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int32", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
-               {name: "LoweredAtomicAddconst64", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int64", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+               {name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+               {name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
 
                // atomic compare and swap.
                // arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
index ac50769dff373539bb60a7a1ac14f7a2ddbeb252..61d3b0462fc7239d014cf27f2fc84442c5011d63 100644 (file)
@@ -1908,8 +1908,6 @@ const (
        OpLOONG64LoweredAtomicExchange64
        OpLOONG64LoweredAtomicAdd32
        OpLOONG64LoweredAtomicAdd64
-       OpLOONG64LoweredAtomicAddconst32
-       OpLOONG64LoweredAtomicAddconst64
        OpLOONG64LoweredAtomicCas32
        OpLOONG64LoweredAtomicCas64
        OpLOONG64LoweredNilCheck
@@ -25580,7 +25578,6 @@ var opcodeTable = [...]opInfo{
                resultNotInArgs: true,
                faultOnNilArg0:  true,
                hasSideEffects:  true,
-               unsafePoint:     true,
                reg: regInfo{
                        inputs: []inputInfo{
                                {1, 1073741816},          // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31
@@ -25597,7 +25594,6 @@ var opcodeTable = [...]opInfo{
                resultNotInArgs: true,
                faultOnNilArg0:  true,
                hasSideEffects:  true,
-               unsafePoint:     true,
                reg: regInfo{
                        inputs: []inputInfo{
                                {1, 1073741816},          // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31
@@ -25608,40 +25604,6 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
-       {
-               name:            "LoweredAtomicAddconst32",
-               auxType:         auxInt32,
-               argLen:          2,
-               resultNotInArgs: true,
-               faultOnNilArg0:  true,
-               hasSideEffects:  true,
-               unsafePoint:     true,
-               reg: regInfo{
-                       inputs: []inputInfo{
-                               {0, 4611686019501129724}, // SP R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 SB
-                       },
-                       outputs: []outputInfo{
-                               {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
-                       },
-               },
-       },
-       {
-               name:            "LoweredAtomicAddconst64",
-               auxType:         auxInt64,
-               argLen:          2,
-               resultNotInArgs: true,
-               faultOnNilArg0:  true,
-               hasSideEffects:  true,
-               unsafePoint:     true,
-               reg: regInfo{
-                       inputs: []inputInfo{
-                               {0, 4611686019501129724}, // SP R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 SB
-                       },
-                       outputs: []outputInfo{
-                               {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
-                       },
-               },
-       },
        {
                name:            "LoweredAtomicCas32",
                argLen:          4,
index 3eaba1871e4e0a93c31f81cb531eb996f6b8201b..14cbd25ee22ee5bfeb5e4a23365c690e4e9a8d9c 100644 (file)
@@ -256,10 +256,6 @@ func rewriteValueLOONG64(v *Value) bool {
                return rewriteValueLOONG64_OpLOONG64DIVV(v)
        case OpLOONG64DIVVU:
                return rewriteValueLOONG64_OpLOONG64DIVVU(v)
-       case OpLOONG64LoweredAtomicAdd32:
-               return rewriteValueLOONG64_OpLOONG64LoweredAtomicAdd32(v)
-       case OpLOONG64LoweredAtomicAdd64:
-               return rewriteValueLOONG64_OpLOONG64LoweredAtomicAdd64(v)
        case OpLOONG64MASKEQZ:
                return rewriteValueLOONG64_OpLOONG64MASKEQZ(v)
        case OpLOONG64MASKNEZ:
@@ -1694,54 +1690,6 @@ func rewriteValueLOONG64_OpLOONG64DIVVU(v *Value) bool {
        }
        return false
 }
-func rewriteValueLOONG64_OpLOONG64LoweredAtomicAdd32(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       // match: (LoweredAtomicAdd32 ptr (MOVVconst [c]) mem)
-       // cond: is32Bit(c)
-       // result: (LoweredAtomicAddconst32 [int32(c)] ptr mem)
-       for {
-               ptr := v_0
-               if v_1.Op != OpLOONG64MOVVconst {
-                       break
-               }
-               c := auxIntToInt64(v_1.AuxInt)
-               mem := v_2
-               if !(is32Bit(c)) {
-                       break
-               }
-               v.reset(OpLOONG64LoweredAtomicAddconst32)
-               v.AuxInt = int32ToAuxInt(int32(c))
-               v.AddArg2(ptr, mem)
-               return true
-       }
-       return false
-}
-func rewriteValueLOONG64_OpLOONG64LoweredAtomicAdd64(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       // match: (LoweredAtomicAdd64 ptr (MOVVconst [c]) mem)
-       // cond: is32Bit(c)
-       // result: (LoweredAtomicAddconst64 [c] ptr mem)
-       for {
-               ptr := v_0
-               if v_1.Op != OpLOONG64MOVVconst {
-                       break
-               }
-               c := auxIntToInt64(v_1.AuxInt)
-               mem := v_2
-               if !(is32Bit(c)) {
-                       break
-               }
-               v.reset(OpLOONG64LoweredAtomicAddconst64)
-               v.AuxInt = int64ToAuxInt(c)
-               v.AddArg2(ptr, mem)
-               return true
-       }
-       return false
-}
 func rewriteValueLOONG64_OpLOONG64MASKEQZ(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
index 1fe4e99dec9a707348baa841b1500d857a718a52..07d0f584b1da22578f8015c4537903a349e98eea 100644 (file)
@@ -79,6 +79,9 @@ TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
 TEXT ·Loadint64(SB), NOSPLIT, $0-16
        JMP     ·Load64(SB)
 
+TEXT ·Xaddint32(SB),NOSPLIT,$0-20
+       JMP     ·Xadd(SB)
+
 TEXT ·Xaddint64(SB), NOSPLIT, $0-24
        JMP     ·Xadd64(SB)
 
@@ -92,34 +95,25 @@ TEXT ·Xaddint64(SB), NOSPLIT, $0-24
 TEXT ·Casp1(SB), NOSPLIT, $0-25
        JMP     ·Cas64(SB)
 
-// uint32 xadd(uint32 volatile *ptr, int32 delta)
+// uint32 Xadd(uint32 volatile *ptr, int32 delta)
 // Atomically:
 //     *val += delta;
 //     return *val;
 TEXT ·Xadd(SB), NOSPLIT, $0-20
        MOVV    ptr+0(FP), R4
        MOVW    delta+8(FP), R5
-       DBAR
-       LL      (R4), R6
-       ADDU    R6, R5, R7
-       MOVV    R7, R6
-       SC      R7, (R4)
-       BEQ     R7, -4(PC)
-       MOVW    R6, ret+16(FP)
-       DBAR
+       AMADDDBW        R5, (R4), R6
+       ADDV    R6, R5, R4
+       MOVW    R4, ret+16(FP)
        RET
 
+// func Xadd64(ptr *uint64, delta int64) uint64
 TEXT ·Xadd64(SB), NOSPLIT, $0-24
        MOVV    ptr+0(FP), R4
        MOVV    delta+8(FP), R5
-       DBAR
-       LLV     (R4), R6
-       ADDVU   R6, R5, R7
-       MOVV    R7, R6
-       SCV     R7, (R4)
-       BEQ     R7, -4(PC)
-       MOVV    R6, ret+16(FP)
-       DBAR
+       AMADDDBV        R5, (R4), R6
+       ADDV    R6, R5, R4
+       MOVV    R4, ret+16(FP)
        RET
 
 TEXT ·Xchg(SB), NOSPLIT, $0-20