From c18ff292959f18965ab6fa47d5dc7aeea1b2374f Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 25 Jun 2024 14:56:11 -0700 Subject: [PATCH] cmd/compile: make sync/atomic AND/OR operations intrinsic on amd64 Update #61395 Change-Id: I59a950f48efc587dfdffce00e2f4f3ab99d8df00 Reviewed-on: https://go-review.googlesource.com/c/go/+/594738 LUCI-TryBot-Result: Go LUCI Reviewed-by: Keith Randall Reviewed-by: Cherry Mui Reviewed-by: Nicolas Hillegeer --- src/cmd/compile/internal/amd64/ssa.go | 57 ++++++- src/cmd/compile/internal/ssa/_gen/AMD64.rules | 8 +- src/cmd/compile/internal/ssa/_gen/AMD64Ops.go | 14 +- src/cmd/compile/internal/ssa/opGen.go | 130 ++++++++++++++++ src/cmd/compile/internal/ssa/regalloc.go | 11 +- src/cmd/compile/internal/ssa/rewriteAMD64.go | 140 ++++++++++++++++++ src/cmd/compile/internal/ssagen/ssa.go | 64 ++++++-- test/codegen/atomics.go | 68 +++++++++ 8 files changed, 478 insertions(+), 14 deletions(-) diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index ab762c24f6..61f1c88a71 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -1286,7 +1286,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p = s.Prog(x86.ASETEQ) p.To.Type = obj.TYPE_REG p.To.Reg = v.Reg0() - case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock: + case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ANDQlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock, ssa.OpAMD64ORQlock: + // Atomic memory operations that don't need to return the old value. s.Prog(x86.ALOCK) p := s.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG @@ -1294,6 +1295,60 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p.To.Type = obj.TYPE_MEM p.To.Reg = v.Args[0].Reg() ssagen.AddAux(&p.To, v) + case ssa.OpAMD64LoweredAtomicAnd64, ssa.OpAMD64LoweredAtomicOr64, ssa.OpAMD64LoweredAtomicAnd32, ssa.OpAMD64LoweredAtomicOr32: + // Atomic memory operations that need to return the old value. + // We need to do these with compare-and-exchange to get access to the old value. 
+ // loop: + // MOVQ mask, tmp + // MOVQ (addr), AX + // ANDQ AX, tmp + // LOCK CMPXCHGQ tmp, (addr) : note that AX is implicit old value to compare against + // JNE loop + // : result in AX + mov := x86.AMOVQ + op := x86.AANDQ + cmpxchg := x86.ACMPXCHGQ + switch v.Op { + case ssa.OpAMD64LoweredAtomicOr64: + op = x86.AORQ + case ssa.OpAMD64LoweredAtomicAnd32: + mov = x86.AMOVL + op = x86.AANDL + cmpxchg = x86.ACMPXCHGL + case ssa.OpAMD64LoweredAtomicOr32: + mov = x86.AMOVL + op = x86.AORL + cmpxchg = x86.ACMPXCHGL + } + addr := v.Args[0].Reg() + mask := v.Args[1].Reg() + tmp := v.RegTmp() + p1 := s.Prog(mov) + p1.From.Type = obj.TYPE_REG + p1.From.Reg = mask + p1.To.Type = obj.TYPE_REG + p1.To.Reg = tmp + p2 := s.Prog(mov) + p2.From.Type = obj.TYPE_MEM + p2.From.Reg = addr + ssagen.AddAux(&p2.From, v) + p2.To.Type = obj.TYPE_REG + p2.To.Reg = x86.REG_AX + p3 := s.Prog(op) + p3.From.Type = obj.TYPE_REG + p3.From.Reg = x86.REG_AX + p3.To.Type = obj.TYPE_REG + p3.To.Reg = tmp + s.Prog(x86.ALOCK) + p5 := s.Prog(cmpxchg) + p5.From.Type = obj.TYPE_REG + p5.From.Reg = tmp + p5.To.Type = obj.TYPE_MEM + p5.To.Reg = addr + ssagen.AddAux(&p5.To, v) + p6 := s.Prog(x86.AJNE) + p6.To.Type = obj.TYPE_BRANCH + p6.To.SetTarget(p1) case ssa.OpAMD64PrefetchT0, ssa.OpAMD64PrefetchNTA: p := s.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_MEM diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules index 0f7b0bb6d8..d8bdf6b17e 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -578,12 +578,15 @@ (AtomicCompareAndSwap32 ptr old new_ mem) => (CMPXCHGLlock ptr old new_ mem) (AtomicCompareAndSwap64 ptr old new_ mem) => (CMPXCHGQlock ptr old new_ mem) -// Atomic memory updates. +// Atomic memory logical operations (old style). (AtomicAnd8 ptr val mem) => (ANDBlock ptr val mem) (AtomicAnd32 ptr val mem) => (ANDLlock ptr val mem) (AtomicOr8 ptr val mem) => (ORBlock ptr val mem) (AtomicOr32 ptr val mem) => (ORLlock ptr val mem) +// Atomic memory logical operations (new style). +(Atomic(And64|And32|Or64|Or32)value ptr val mem) => (LoweredAtomic(And64|And32|Or64|Or32) ptr val mem) + // Write barrier. (WB ...) => (LoweredWB ...) @@ -1697,3 +1700,6 @@ ((SHL|SHR|SAR)XQload [off] {sym} ptr (MOVQconst [c]) mem) => ((SHL|SHR|SAR)Qconst [int8(c&63)] (MOVQload [off] {sym} ptr mem)) ((SHL|SHR|SAR)XQload [off] {sym} ptr (MOVLconst [c]) mem) => ((SHL|SHR|SAR)Qconst [int8(c&63)] (MOVQload [off] {sym} ptr mem)) ((SHL|SHR|SAR)XLload [off] {sym} ptr (MOVLconst [c]) mem) => ((SHL|SHR|SAR)Lconst [int8(c&31)] (MOVLload [off] {sym} ptr mem)) + +// Convert atomic logical operations to easier ones if we don't use the result. 
+(Select1 a:(LoweredAtomic(And64|And32|Or64|Or32) ptr val mem)) && a.Uses == 1 && clobber(a) => ((ANDQ|ANDL|ORQ|ORL)lock ptr val mem) diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go index 606171947b..3440c43532 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -152,6 +152,7 @@ func init() { gpstoreconstidx = regInfo{inputs: []regMask{gpspsbg, gpsp, 0}} gpstorexchg = regInfo{inputs: []regMask{gp, gpspsbg, 0}, outputs: []regMask{gp}} cmpxchg = regInfo{inputs: []regMask{gp, ax, gp, 0}, outputs: []regMask{gp, 0}, clobbers: ax} + atomicLogic = regInfo{inputs: []regMask{gp &^ ax, gp &^ ax, 0}, outputs: []regMask{ax, 0}} fp01 = regInfo{inputs: nil, outputs: fponly} fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly} @@ -1040,11 +1041,22 @@ {name: "CMPXCHGLlock", argLength: 4, reg: cmpxchg, asm: "CMPXCHGL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, {name: "CMPXCHGQlock", argLength: 4, reg: cmpxchg, asm: "CMPXCHGQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, - // Atomic memory updates. + // Atomic memory updates using logical operations. + // Old style that just returns the memory state. {name: "ANDBlock", argLength: 3, reg: gpstore, asm: "ANDB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1 {name: "ANDLlock", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1 + {name: "ANDQlock", argLength: 3, reg: gpstore, asm: "ANDQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1 {name: "ORBlock", argLength: 3, reg: gpstore, asm: "ORB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1 {name: "ORLlock", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1 + {name: "ORQlock", argLength: 3, reg: gpstore, asm: "ORQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1 + + // Atomic memory updates using logical operations. + // *(arg0+auxint+aux) op= arg1. arg2=mem. + // New style that returns a tuple of (old value, memory).
+ {name: "LoweredAtomicAnd64", argLength: 3, reg: atomicLogic, resultNotInArgs: true, asm: "ANDQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr", unsafePoint: true, needIntTemp: true}, + {name: "LoweredAtomicAnd32", argLength: 3, reg: atomicLogic, resultNotInArgs: true, asm: "ANDL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr", unsafePoint: true, needIntTemp: true}, + {name: "LoweredAtomicOr64", argLength: 3, reg: atomicLogic, resultNotInArgs: true, asm: "ORQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr", unsafePoint: true, needIntTemp: true}, + {name: "LoweredAtomicOr32", argLength: 3, reg: atomicLogic, resultNotInArgs: true, asm: "ORL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr", unsafePoint: true, needIntTemp: true}, // Prefetch instructions // Do prefetch arg0 address. arg0=addr, arg1=memory. Instruction variant selects locality hint diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 9c464f6a1f..91728da80d 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1071,8 +1071,14 @@ const ( OpAMD64CMPXCHGQlock OpAMD64ANDBlock OpAMD64ANDLlock + OpAMD64ANDQlock OpAMD64ORBlock OpAMD64ORLlock + OpAMD64ORQlock + OpAMD64LoweredAtomicAnd64 + OpAMD64LoweredAtomicAnd32 + OpAMD64LoweredAtomicOr64 + OpAMD64LoweredAtomicOr32 OpAMD64PrefetchT0 OpAMD64PrefetchNTA OpAMD64ANDNQ @@ -14128,6 +14134,22 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "ANDQlock", + auxType: auxSymOff, + argLen: 3, + clobberFlags: true, + faultOnNilArg0: true, + hasSideEffects: true, + symEffect: SymRdWr, + asm: x86.AANDQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 49151}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 + {0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 g R15 SB + }, + }, + }, { name: "ORBlock", auxType: auxSymOff, @@ -14160,6 +14182,114 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "ORQlock", + auxType: auxSymOff, + argLen: 3, + clobberFlags: true, + faultOnNilArg0: true, + hasSideEffects: true, + symEffect: SymRdWr, + asm: x86.AORQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 49151}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 + {0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 g R15 SB + }, + }, + }, + { + name: "LoweredAtomicAnd64", + auxType: auxSymOff, + argLen: 3, + resultNotInArgs: true, + clobberFlags: true, + needIntTemp: true, + faultOnNilArg0: true, + hasSideEffects: true, + unsafePoint: true, + symEffect: SymRdWr, + asm: x86.AANDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 49134}, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + {1, 49134}, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + outputs: []outputInfo{ + {1, 0}, + {0, 1}, // AX + }, + }, + }, + { + name: "LoweredAtomicAnd32", + auxType: auxSymOff, + argLen: 3, + resultNotInArgs: true, + clobberFlags: true, + needIntTemp: true, + faultOnNilArg0: true, + hasSideEffects: true, + unsafePoint: true, + symEffect: SymRdWr, + asm: x86.AANDL, + reg: regInfo{ + inputs: []inputInfo{ + {0, 49134}, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + {1, 49134}, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + outputs: []outputInfo{ + {1, 0}, + {0, 1}, // AX + }, + }, + }, + { + name: "LoweredAtomicOr64", + auxType: auxSymOff, + argLen: 3, + resultNotInArgs: true, + clobberFlags: 
true, + needIntTemp: true, + faultOnNilArg0: true, + hasSideEffects: true, + unsafePoint: true, + symEffect: SymRdWr, + asm: x86.AORQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 49134}, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + {1, 49134}, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + outputs: []outputInfo{ + {1, 0}, + {0, 1}, // AX + }, + }, + }, + { + name: "LoweredAtomicOr32", + auxType: auxSymOff, + argLen: 3, + resultNotInArgs: true, + clobberFlags: true, + needIntTemp: true, + faultOnNilArg0: true, + hasSideEffects: true, + unsafePoint: true, + symEffect: SymRdWr, + asm: x86.AORL, + reg: regInfo{ + inputs: []inputInfo{ + {0, 49134}, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + {1, 49134}, // CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + outputs: []outputInfo{ + {1, 0}, + {0, 1}, // AX + }, + }, + }, { name: "PrefetchT0", argLen: 2, diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go index 2325b9ee45..68f90e4a50 100644 --- a/src/cmd/compile/internal/ssa/regalloc.go +++ b/src/cmd/compile/internal/ssa/regalloc.go @@ -1612,8 +1612,14 @@ func (s *regAllocState) regalloc(f *Func) { // allocate it after all the input registers, but before // the input registers are freed via advanceUses below. // (Not all instructions need that distinct part, but it is conservative.) + // We also ensure it is not any of the single-choice output registers. if opcodeTable[v.Op].needIntTemp { m := s.allocatable & s.f.Config.gpRegMask + for _, out := range regspec.outputs { + if countRegs(out.regs) == 1 { + m &^= out.regs + } + } if m&^desired.avoid&^s.nospill != 0 { m &^= desired.avoid } @@ -1651,9 +1657,12 @@ func (s *regAllocState) regalloc(f *Func) { used |= regMask(1) << tmpReg } for _, out := range regspec.outputs { + if out.regs == 0 { + continue + } mask := out.regs & s.allocatable &^ used if mask == 0 { - continue + s.f.Fatalf("can't find any output register %s", v.LongString()) } if opcodeTable[v.Op].resultInArg0 && out.idx == 0 { if !opcodeTable[v.Op].commutative { diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 2670ba91b8..b3a644cbed 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -573,6 +573,10 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAtomicAdd64(v) case OpAtomicAnd32: return rewriteValueAMD64_OpAtomicAnd32(v) + case OpAtomicAnd32value: + return rewriteValueAMD64_OpAtomicAnd32value(v) + case OpAtomicAnd64value: + return rewriteValueAMD64_OpAtomicAnd64value(v) case OpAtomicAnd8: return rewriteValueAMD64_OpAtomicAnd8(v) case OpAtomicCompareAndSwap32: @@ -593,6 +597,10 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAtomicLoadPtr(v) case OpAtomicOr32: return rewriteValueAMD64_OpAtomicOr32(v) + case OpAtomicOr32value: + return rewriteValueAMD64_OpAtomicOr32value(v) + case OpAtomicOr64value: + return rewriteValueAMD64_OpAtomicOr64value(v) case OpAtomicOr8: return rewriteValueAMD64_OpAtomicOr8(v) case OpAtomicStore32: @@ -23873,6 +23881,36 @@ func rewriteValueAMD64_OpAtomicAnd32(v *Value) bool { return true } } +func rewriteValueAMD64_OpAtomicAnd32value(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (AtomicAnd32value ptr val mem) + // result: (LoweredAtomicAnd32 ptr val mem) + for { + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64LoweredAtomicAnd32) + v.AddArg3(ptr, val, mem) + return true + } +} +func 
rewriteValueAMD64_OpAtomicAnd64value(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (AtomicAnd64value ptr val mem) + // result: (LoweredAtomicAnd64 ptr val mem) + for { + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64LoweredAtomicAnd64) + v.AddArg3(ptr, val, mem) + return true + } +} func rewriteValueAMD64_OpAtomicAnd8(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -24019,6 +24057,36 @@ func rewriteValueAMD64_OpAtomicOr32(v *Value) bool { return true } } +func rewriteValueAMD64_OpAtomicOr32value(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (AtomicOr32value ptr val mem) + // result: (LoweredAtomicOr32 ptr val mem) + for { + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64LoweredAtomicOr32) + v.AddArg3(ptr, val, mem) + return true + } +} +func rewriteValueAMD64_OpAtomicOr64value(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (AtomicOr64value ptr val mem) + // result: (LoweredAtomicOr64 ptr val mem) + for { + ptr := v_0 + val := v_1 + mem := v_2 + v.reset(OpAMD64LoweredAtomicOr64) + v.AddArg3(ptr, val, mem) + return true + } +} func rewriteValueAMD64_OpAtomicOr8(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -29524,6 +29592,78 @@ func rewriteValueAMD64_OpSelect1(v *Value) bool { v.AddArg(tuple) return true } + // match: (Select1 a:(LoweredAtomicAnd64 ptr val mem)) + // cond: a.Uses == 1 && clobber(a) + // result: (ANDQlock ptr val mem) + for { + a := v_0 + if a.Op != OpAMD64LoweredAtomicAnd64 { + break + } + mem := a.Args[2] + ptr := a.Args[0] + val := a.Args[1] + if !(a.Uses == 1 && clobber(a)) { + break + } + v.reset(OpAMD64ANDQlock) + v.AddArg3(ptr, val, mem) + return true + } + // match: (Select1 a:(LoweredAtomicAnd32 ptr val mem)) + // cond: a.Uses == 1 && clobber(a) + // result: (ANDLlock ptr val mem) + for { + a := v_0 + if a.Op != OpAMD64LoweredAtomicAnd32 { + break + } + mem := a.Args[2] + ptr := a.Args[0] + val := a.Args[1] + if !(a.Uses == 1 && clobber(a)) { + break + } + v.reset(OpAMD64ANDLlock) + v.AddArg3(ptr, val, mem) + return true + } + // match: (Select1 a:(LoweredAtomicOr64 ptr val mem)) + // cond: a.Uses == 1 && clobber(a) + // result: (ORQlock ptr val mem) + for { + a := v_0 + if a.Op != OpAMD64LoweredAtomicOr64 { + break + } + mem := a.Args[2] + ptr := a.Args[0] + val := a.Args[1] + if !(a.Uses == 1 && clobber(a)) { + break + } + v.reset(OpAMD64ORQlock) + v.AddArg3(ptr, val, mem) + return true + } + // match: (Select1 a:(LoweredAtomicOr32 ptr val mem)) + // cond: a.Uses == 1 && clobber(a) + // result: (ORLlock ptr val mem) + for { + a := v_0 + if a.Op != OpAMD64LoweredAtomicOr32 { + break + } + mem := a.Args[2] + ptr := a.Args[0] + val := a.Args[1] + if !(a.Uses == 1 && clobber(a)) { + break + } + v.reset(OpAMD64ORLlock) + v.AddArg3(ptr, val, mem) + return true + } return false } func rewriteValueAMD64_OpSelectN(v *Value) bool { diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index 98259f43ce..765f4c2e98 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -4164,6 +4164,13 @@ func (s *state) sfcall(op ssa.Op, args ...*ssa.Value) (*ssa.Value, bool) { return nil, false } +// split breaks up a tuple-typed value into its 2 parts. 
+func (s *state) split(v *ssa.Value) (*ssa.Value, *ssa.Value) { + p0 := s.newValue1(ssa.OpSelect0, v.Type.FieldType(0), v) + p1 := s.newValue1(ssa.OpSelect1, v.Type.FieldType(1), v) + return p0, p1 +} + var intrinsics map[intrinsicKey]intrinsicBuilder // An intrinsicBuilder converts a call node n into an ssa value that @@ -4531,6 +4538,7 @@ func InitTables() { makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64), sys.ARM64) + // Old-style atomic logical operation API (all supported archs except arm64). addF("internal/runtime/atomic", "And8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem()) @@ -4556,6 +4564,8 @@ func InitTables() { }, sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) + // arm64 always uses the new-style atomic logical operations, for both the + // old and new style API. addF("internal/runtime/atomic", "And8", makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64), sys.ARM64) @@ -4581,6 +4591,40 @@ func InitTables() { makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64), sys.ARM64) + // New-style atomic logical operations, which return the old memory value. + addF("internal/runtime/atomic", "And64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) + p0, p1 := s.split(v) + s.vars[memVar] = p1 + return p0 + }, + sys.AMD64) + addF("internal/runtime/atomic", "And32", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) + p0, p1 := s.split(v) + s.vars[memVar] = p1 + return p0 + }, + sys.AMD64) + addF("internal/runtime/atomic", "Or64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) + p0, p1 := s.split(v) + s.vars[memVar] = p1 + return p0 + }, + sys.AMD64) + addF("internal/runtime/atomic", "Or32", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) + p0, p1 := s.split(v) + s.vars[memVar] = p1 + return p0 + }, + sys.AMD64) + // Aliases for atomic load operations alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...) alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...) @@ -5108,16 +5152,16 @@ func InitTables() { alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...) alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...) 
- alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64) - alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64) - alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64) - alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64) - alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64) - alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64) - alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64) - alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64) - alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64) - alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64) + alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64) + alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64) + alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) + alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) + alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64) + alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64) + alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64) + alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) + alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) + alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64) /******** math/big ********/ alias("math/big", "mulWW", "math/bits", "Mul64", p8...) diff --git a/test/codegen/atomics.go b/test/codegen/atomics.go index feaa31b9c1..14024dcd83 100644 --- a/test/codegen/atomics.go +++ b/test/codegen/atomics.go @@ -22,6 +22,74 @@ func (c *Counter) Increment() { // arm64/v8.1:"LDADDALW" // arm64/v8.0:".*arm64HasATOMICS" // arm64/v8.1:-".*arm64HasATOMICS" + // amd64:"LOCK",-"CMPXCHG" atomic.AddInt32(&c.count, 1) } +func atomicLogical64(x *atomic.Uint64) uint64 { + var r uint64 + + // arm64/v8.0:"LDCLRALD" + // arm64/v8.1:"LDCLRALD" + // arm64/v8.0:".*arm64HasATOMICS" + // arm64/v8.1:-".*arm64HasATOMICS" + // On amd64, make sure we use LOCK+AND instead of CMPXCHG when we don't use the result. + // amd64:"LOCK",-"CMPXCHGQ" + x.And(11) + // arm64/v8.0:"LDCLRALD" + // arm64/v8.1:"LDCLRALD" + // arm64/v8.0:".*arm64HasATOMICS" + // arm64/v8.1:-".*arm64HasATOMICS" + // amd64:"LOCK","CMPXCHGQ" + r += x.And(22) + + // arm64/v8.0:"LDORALD" + // arm64/v8.1:"LDORALD" + // arm64/v8.0:".*arm64HasATOMICS" + // arm64/v8.1:-".*arm64HasATOMICS" + // On amd64, make sure we use LOCK+OR instead of CMPXCHG when we don't use the result. + // amd64:"LOCK",-"CMPXCHGQ" + x.Or(33) + // arm64/v8.0:"LDORALD" + // arm64/v8.1:"LDORALD" + // arm64/v8.0:".*arm64HasATOMICS" + // arm64/v8.1:-".*arm64HasATOMICS" + // amd64:"LOCK","CMPXCHGQ" + r += x.Or(44) + + return r +} + +func atomicLogical32(x *atomic.Uint32) uint32 { + var r uint32 + + // arm64/v8.0:"LDCLRALW" + // arm64/v8.1:"LDCLRALW" + // arm64/v8.0:".*arm64HasATOMICS" + // arm64/v8.1:-".*arm64HasATOMICS" + // On amd64, make sure we use LOCK+AND instead of CMPXCHG when we don't use the result. 
+ // amd64:"LOCK",-"CMPXCHGL" + x.And(11) + // arm64/v8.0:"LDCLRALW" + // arm64/v8.1:"LDCLRALW" + // arm64/v8.0:".*arm64HasATOMICS" + // arm64/v8.1:-".*arm64HasATOMICS" + // amd64:"LOCK","CMPXCHGL" + r += x.And(22) + + // arm64/v8.0:"LDORALW" + // arm64/v8.1:"LDORALW" + // arm64/v8.0:".*arm64HasATOMICS" + // arm64/v8.1:-".*arm64HasATOMICS" + // On amd64, make sure we use LOCK+OR instead of CMPXCHG when we don't use the result. + // amd64:"LOCK",-"CMPXCHGL" + x.Or(33) + // arm64/v8.0:"LDORALW" + // arm64/v8.1:"LDORALW" + // arm64/v8.0:".*arm64HasATOMICS" + // arm64/v8.1:-".*arm64HasATOMICS" + // amd64:"LOCK","CMPXCHGL" + r += x.Or(44) + + return r +} -- 2.48.1
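For illustration, a minimal, self-contained sketch (not part of the patch) of what this change means at the Go source level, mirroring the expectations encoded in test/codegen/atomics.go above. The helper orOld, the variable names, and the bit masks are made up for this example; And and Or on atomic.Uint64 are the sync/atomic calls whose code generation this CL changes.

package main

import (
	"fmt"
	"sync/atomic"
)

// orOld models, in Go, the loop that ssaGenValue emits for LoweredAtomicOr64:
// read the current value, OR in the mask, and retry with a compare-and-swap
// (LOCK CMPXCHGQ in the generated code) until no other writer has intervened.
// The returned value is the old contents, which the generated loop leaves in AX.
func orOld(x *atomic.Uint64, mask uint64) uint64 {
	for {
		old := x.Load()
		if x.CompareAndSwap(old, old|mask) {
			return old
		}
	}
}

func main() {
	var flags atomic.Uint64
	flags.Store(0b0000_1111)

	// Old value discarded: after this CL, amd64 should lower these calls to a
	// single LOCK ANDQ/ORQ via the Select1 rewrite rules added to AMD64.rules.
	flags.Or(0b0100_0000)
	flags.And(^uint64(0b0000_0001))

	// Old value used: this stays as LoweredAtomicOr64 and should compile to
	// the MOVQ / ORQ / LOCK CMPXCHGQ / JNE loop emitted in amd64/ssa.go.
	old := flags.Or(0b1000_0000)

	fmt.Printf("old=%08b now=%08b modeled old=%08b\n",
		old, flags.Load(), orOld(&flags, 0b0001_0000))
}

The new aliases at the end of ssagen/ssa.go are what route the sync/atomic AndInt64, OrUint32, and related wrappers onto these intrinsics on amd64, the same way they were already routed on arm64.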