arm64.ALDADDB, arm64.ALDADDH, arm64.ALDADDW, arm64.ALDADDD,
arm64.ALDANDB, arm64.ALDANDH, arm64.ALDANDW, arm64.ALDANDD,
arm64.ALDEORB, arm64.ALDEORH, arm64.ALDEORW, arm64.ALDEORD,
- arm64.ALDORB, arm64.ALDORH, arm64.ALDORW, arm64.ALDORD:
+ arm64.ALDORB, arm64.ALDORH, arm64.ALDORW, arm64.ALDORD,
+ arm64.ALDADDALD, arm64.ALDADDALW:
return true
}
return false
LDORH R5, (RSP), R7 // e7332578
LDORB R5, (R6), R7 // c7302538
LDORB R5, (RSP), R7 // e7332538
+ LDADDALD R2, (R1), R3 // 2300e2f8
+ LDADDALW R5, (R4), R6 // 8600e5b8
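+ // LDADDAL* are the acquire-release variants of LDADD*.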
// RET
//
p3.From.Reg = arm64.REGTMP
p3.To.Type = obj.TYPE_BRANCH
gc.Patch(p3, p)
+ case ssa.OpARM64LoweredAtomicAdd64Variant,
+ ssa.OpARM64LoweredAtomicAdd32Variant:
+ // LDADDAL Rarg1, (Rarg0), Rout
+ // ADD Rarg1, Rout
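+ // (LDADDAL writes the old value of *Rarg0 to Rout; the trailing ADD
+ // produces the new value, which is what the intrinsic must return.)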
+ op := arm64.ALDADDALD
+ if v.Op == ssa.OpARM64LoweredAtomicAdd32Variant {
+ op = arm64.ALDADDALW
+ }
+ r0 := v.Args[0].Reg()
+ r1 := v.Args[1].Reg()
+ out := v.Reg0()
+ p := s.Prog(op)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = r1
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = r0
+ p.RegTo2 = out
+ p1 := s.Prog(arm64.AADD)
+ p1.From.Type = obj.TYPE_REG
+ p1.From.Reg = r1
+ p1.To.Type = obj.TYPE_REG
+ p1.To.Reg = out
case ssa.OpARM64LoweredAtomicCas64,
ssa.OpARM64LoweredAtomicCas32:
// LDAXR (Rarg0), Rtmp
racewriterange,
supportPopcnt,
supportSSE41,
+ arm64SupportAtomics,
typedmemclr,
typedmemmove,
Udiv,
racewriterange = sysfunc("racewriterange")
supportPopcnt = sysfunc("support_popcnt")
supportSSE41 = sysfunc("support_sse41")
+ arm64SupportAtomics = sysfunc("arm64_support_atomics")
typedmemclr = sysfunc("typedmemclr")
typedmemmove = sysfunc("typedmemmove")
Udiv = sysfunc("udiv")
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[TUINT32], v)
},
- sys.AMD64, sys.ARM64, sys.S390X, sys.MIPS, sys.MIPS64, sys.PPC64)
+ sys.AMD64, sys.S390X, sys.MIPS, sys.MIPS64, sys.PPC64)
addF("runtime/internal/atomic", "Xadd64",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[TUINT64], types.TypeMem), args[0], args[1], s.mem())
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[TUINT64], v)
},
- sys.AMD64, sys.ARM64, sys.S390X, sys.MIPS64, sys.PPC64)
+ sys.AMD64, sys.S390X, sys.MIPS64, sys.PPC64)
+
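+ // makeXaddARM64 returns an intrinsic builder for Xadd/Xadd64 on ARM64.
+ // The generated code branches on arm64SupportAtomics at run time: when the
+ // atomic instructions are present, op1 (lowered to LDADDAL + ADD) is used;
+ // otherwise it falls back to op0, the original expansion.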
+ makeXaddARM64 := func(op0 ssa.Op, op1 ssa.Op, ty types.EType) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+ return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+ // Whether the target supports the atomic instructions is determined by dynamic (run-time) detection.
+ addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), arm64SupportAtomics, s.sb)
+ v := s.load(types.Types[TBOOL], addr)
+ b := s.endBlock()
+ b.Kind = ssa.BlockIf
+ b.SetControl(v)
+ bTrue := s.f.NewBlock(ssa.BlockPlain)
+ bFalse := s.f.NewBlock(ssa.BlockPlain)
+ bEnd := s.f.NewBlock(ssa.BlockPlain)
+ b.AddEdgeTo(bTrue)
+ b.AddEdgeTo(bFalse)
+ b.Likely = ssa.BranchUnlikely // most machines don't have the atomic instructions yet
+
+ // The atomic instructions are available - use them directly.
+ s.startBlock(bTrue)
+ v0 := s.newValue3(op1, types.NewTuple(types.Types[ty], types.TypeMem), args[0], args[1], s.mem())
+ s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v0)
+ s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[ty], v0)
+ s.endBlock().AddEdgeTo(bEnd)
+
+ // Fall back to the original load-exclusive/store-exclusive sequence.
+ s.startBlock(bFalse)
+ v1 := s.newValue3(op0, types.NewTuple(types.Types[ty], types.TypeMem), args[0], args[1], s.mem())
+ s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v1)
+ s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[ty], v1)
+ s.endBlock().AddEdgeTo(bEnd)
+
+ // Merge results.
+ s.startBlock(bEnd)
+ return s.variable(n, types.Types[ty])
+ }
+ }
+
+ addF("runtime/internal/atomic", "Xadd",
+ makeXaddARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, TUINT32),
+ sys.ARM64)
+ addF("runtime/internal/atomic", "Xadd64",
+ makeXaddARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, TUINT64),
+ sys.ARM64)
addF("runtime/internal/atomic", "Cas",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
(AtomicAnd8 ptr val mem) -> (Select1 (LoweredAtomicAnd8 ptr val mem))
(AtomicOr8 ptr val mem) -> (Select1 (LoweredAtomicOr8 ptr val mem))
+(AtomicAdd32Variant ptr val mem) -> (LoweredAtomicAdd32Variant ptr val mem)
+(AtomicAdd64Variant ptr val mem) -> (LoweredAtomicAdd64Variant ptr val mem)
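+// (The front end emits the Variant ops only under a run-time arm64SupportAtomics check; see the intrinsics in gc/ssa.go.)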
+
// Write barrier.
(WB {fn} destptr srcptr mem) -> (LoweredWB {fn} destptr srcptr mem)
{name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+ // atomic add variant.
+ // *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
+ // LDADDAL Rarg1, (Rarg0), Rout
+ // ADD Rarg1, Rout
+ {name: "LoweredAtomicAdd64Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicAdd32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
+
// atomic compare and swap.
// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
// if *arg0 == arg1 {
{name: "AtomicAnd8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
{name: "AtomicOr8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
+ // Atomic operation variants
+ // These variants have the same semantics as the atomic operations above,
+ // but they allow the compiler to generate more efficient code on certain
+ // modern machines, using run-time CPU feature detection.
+ // Currently they are used only on ARM64.
+ {name: "AtomicAdd32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+ {name: "AtomicAdd64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
+
// Clobber experiment op
{name: "Clobber", argLength: 0, typ: "Void", aux: "SymOff", symEffect: "None"}, // write an invalid pointer value to the given pointer slot of a stack variable
}
OpARM64LoweredAtomicExchange32
OpARM64LoweredAtomicAdd64
OpARM64LoweredAtomicAdd32
+ OpARM64LoweredAtomicAdd64Variant
+ OpARM64LoweredAtomicAdd32Variant
OpARM64LoweredAtomicCas64
OpARM64LoweredAtomicCas32
OpARM64LoweredAtomicAnd8
OpAtomicCompareAndSwap64
OpAtomicAnd8
OpAtomicOr8
+ OpAtomicAdd32Variant
+ OpAtomicAdd64Variant
OpClobber
)
},
},
},
+ {
+ name: "LoweredAtomicAdd64Variant",
+ argLen: 3,
+ resultNotInArgs: true,
+ faultOnNilArg0: true,
+ hasSideEffects: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+ {0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+ },
+ outputs: []outputInfo{
+ {0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+ },
+ },
+ },
+ {
+ name: "LoweredAtomicAdd32Variant",
+ argLen: 3,
+ resultNotInArgs: true,
+ faultOnNilArg0: true,
+ hasSideEffects: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+ {0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
+ },
+ outputs: []outputInfo{
+ {0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+ },
+ },
+ },
{
name: "LoweredAtomicCas64",
argLen: 4,
hasSideEffects: true,
generic: true,
},
+ {
+ name: "AtomicAdd32Variant",
+ argLen: 3,
+ hasSideEffects: true,
+ generic: true,
+ },
+ {
+ name: "AtomicAdd64Variant",
+ argLen: 3,
+ hasSideEffects: true,
+ generic: true,
+ },
{
name: "Clobber",
auxType: auxSymOff,
return rewriteValueARM64_OpAndB_0(v)
case OpAtomicAdd32:
return rewriteValueARM64_OpAtomicAdd32_0(v)
+ case OpAtomicAdd32Variant:
+ return rewriteValueARM64_OpAtomicAdd32Variant_0(v)
case OpAtomicAdd64:
return rewriteValueARM64_OpAtomicAdd64_0(v)
+ case OpAtomicAdd64Variant:
+ return rewriteValueARM64_OpAtomicAdd64Variant_0(v)
case OpAtomicAnd8:
return rewriteValueARM64_OpAtomicAnd8_0(v)
case OpAtomicCompareAndSwap32:
return true
}
}
+func rewriteValueARM64_OpAtomicAdd32Variant_0(v *Value) bool {
+ // match: (AtomicAdd32Variant ptr val mem)
+ // cond:
+ // result: (LoweredAtomicAdd32Variant ptr val mem)
+ for {
+ _ = v.Args[2]
+ ptr := v.Args[0]
+ val := v.Args[1]
+ mem := v.Args[2]
+ v.reset(OpARM64LoweredAtomicAdd32Variant)
+ v.AddArg(ptr)
+ v.AddArg(val)
+ v.AddArg(mem)
+ return true
+ }
+}
func rewriteValueARM64_OpAtomicAdd64_0(v *Value) bool {
// match: (AtomicAdd64 ptr val mem)
// cond:
return true
}
}
+func rewriteValueARM64_OpAtomicAdd64Variant_0(v *Value) bool {
+ // match: (AtomicAdd64Variant ptr val mem)
+ // cond:
+ // result: (LoweredAtomicAdd64Variant ptr val mem)
+ for {
+ _ = v.Args[2]
+ ptr := v.Args[0]
+ val := v.Args[1]
+ mem := v.Args[2]
+ v.reset(OpARM64LoweredAtomicAdd64Variant)
+ v.AddArg(ptr)
+ v.AddArg(val)
+ v.AddArg(mem)
+ return true
+ }
+}
func rewriteValueARM64_OpAtomicAnd8_0(v *Value) bool {
b := v.Block
_ = b
AHVC
AIC
AISB
+ ALDADDALD
+ ALDADDALW
ALDADDB
ALDADDH
ALDADDW
"HVC",
"IC",
"ISB",
+ "LDADDALD",
+ "LDADDALW",
"LDADDB",
"LDADDH",
"LDADDW",
oprangeset(ASWPB, t)
oprangeset(ASWPH, t)
oprangeset(ASWPW, t)
+ oprangeset(ALDADDALD, t)
+ oprangeset(ALDADDALW, t)
oprangeset(ALDADDB, t)
oprangeset(ALDADDH, t)
oprangeset(ALDADDW, t)
rt := p.RegTo2
rb := p.To.Reg
switch p.As {
- case ASWPD, ALDADDD, ALDANDD, ALDEORD, ALDORD: // 64-bit
+ case ASWPD, ALDADDALD, ALDADDD, ALDANDD, ALDEORD, ALDORD: // 64-bit
o1 = 3 << 30
- case ASWPW, ALDADDW, ALDANDW, ALDEORW, ALDORW: // 32-bit
+ case ASWPW, ALDADDALW, ALDADDW, ALDANDW, ALDEORW, ALDORW: // 32-bit
o1 = 2 << 30
case ASWPH, ALDADDH, ALDANDH, ALDEORH, ALDORH: // 16-bit
o1 = 1 << 30
switch p.As {
case ASWPD, ASWPW, ASWPH, ASWPB:
o1 |= 0x20 << 10
- case ALDADDD, ALDADDW, ALDADDH, ALDADDB:
+ case ALDADDALD, ALDADDALW, ALDADDD, ALDADDW, ALDADDH, ALDADDB:
o1 |= 0x00 << 10
case ALDANDD, ALDANDW, ALDANDH, ALDANDB:
o1 |= 0x04 << 10
case ALDORD, ALDORW, ALDORH, ALDORB:
o1 |= 0x0c << 10
}
+ switch p.As {
+ case ALDADDALD, ALDADDALW:
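+ // set the A (bit 23) and R (bit 22) fields: LDADDAL is the acquire-release form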
+ o1 |= 3 << 22
+ }
o1 |= 0x1c1<<21 | uint32(rs&31)<<16 | uint32(rb&31)<<5 | uint32(rt&31)
case 50: /* sys/sysl */
atomic.Store(&x, 0)
}
}
+
+func BenchmarkXadd(b *testing.B) {
+ var x uint32
+ ptr := &x
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ atomic.Xadd(ptr, 1)
+ }
+ })
+}
+
+func BenchmarkXadd64(b *testing.B) {
+ var x uint64
+ ptr := &x
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ atomic.Xadd64(ptr, 1)
+ }
+ })
+}
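
For reference, Xadd returns the updated value, the same convention as the public sync/atomic API, and that is the contract the LDADDAL + ADD lowering above preserves. A minimal, self-contained illustration using only the public API (a sketch independent of this change; the LDADDALW comment describes what the compiler may emit, not what it must emit):

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	func main() {
		var x uint32
		// AddUint32 returns the new value, old+delta. On an ARM64 machine
		// with the LSE atomics this may compile to LDADDALW followed by ADD.
		fmt.Println(atomic.AddUint32(&x, 1)) // 1
		fmt.Println(atomic.AddUint32(&x, 3)) // 4
	}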
support_popcnt = cpu.X86.HasPOPCNT
support_sse2 = cpu.X86.HasSSE2
support_sse41 = cpu.X86.HasSSE41
+
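+ // cpu.ARM64.HasATOMICS reports whether the CPU implements the ARMv8.1 atomic (LSE) instructions.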
+ arm64_support_atomics = cpu.ARM64.HasATOMICS
}
// The bootstrap sequence is:
processorVersionInfo uint32
isIntel bool
lfenceBeforeRdtsc bool
+
+ // Set in runtime.cpuinit.
support_erms bool
support_popcnt bool
support_sse2 bool
support_sse41 bool
+ arm64_support_atomics bool
goarm uint8 // set by cmd/link on arm systems
framepointer_enabled bool // set by cmd/link