From: Austin Clements
Date: Thu, 26 Oct 2017 16:33:04 +0000 (-0400)
Subject: cmd/compile: compiler support for buffered write barrier
X-Git-Tag: go1.10beta1~538
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=7e343134d334f7317b342db19c3e90d1f3f200cc;p=gostls13.git

cmd/compile: compiler support for buffered write barrier

This CL implements the compiler support for calling the buffered write
barrier added by the previous CL.

Since the buffered write barrier is only implemented on amd64 right now,
this still supports the old, eager write barrier as well. There's little
overhead to supporting both, and this way a few tests in test/fixedbugs
that expect to have liveness maps at write barrier calls can easily opt
in to the old, eager barrier.

This significantly improves the performance of the write barrier:

name             old time/op  new time/op  delta
WriteBarrier-12  73.5ns ±20%  19.2ns ±27%  -73.90%  (p=0.000 n=19+18)

It also reduces the size of binaries because the write barrier call is
more compact:

name        old object-bytes  new object-bytes  delta
Template         398k ± 0%         393k ± 0%    -1.14%  (p=0.008 n=5+5)
Unicode          208k ± 0%         206k ± 0%    -1.00%  (p=0.008 n=5+5)
GoTypes         1.18M ± 0%        1.15M ± 0%    -2.00%  (p=0.008 n=5+5)
Compiler        4.05M ± 0%        3.88M ± 0%    -4.26%  (p=0.008 n=5+5)
SSA             8.25M ± 0%        8.11M ± 0%    -1.59%  (p=0.008 n=5+5)
Flate            228k ± 0%         224k ± 0%    -1.83%  (p=0.008 n=5+5)
GoParser         295k ± 0%         284k ± 0%    -3.62%  (p=0.008 n=5+5)
Reflect         1.00M ± 0%        0.99M ± 0%    -0.70%  (p=0.008 n=5+5)
Tar              339k ± 0%         333k ± 0%    -1.67%  (p=0.008 n=5+5)
XML              404k ± 0%         395k ± 0%    -2.10%  (p=0.008 n=5+5)
[Geo mean]       704k              690k         -2.00%

name       old exe-bytes  new exe-bytes  delta
HelloSize      1.05M ± 0%     1.04M ± 0%  -1.55%  (p=0.008 n=5+5)

https://perf.golang.org/search?q=upload:20171027.1

(Amusingly, this also reduces compiler allocations by 0.75%, which,
combined with the better write barrier, speeds up the compiler overall
by 2.10%. See the perf link.)
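As background, and not part of the CL itself, here is a minimal Go sketch of the kind of heap pointer store the write barrier pass instruments; the comments summarize the two lowerings this CL supports and gloss over the exact calling conventions:

	package p

	type node struct {
		next *node
	}

	var head *node

	func push(n *node) {
		// Eager barrier (-d=eagerwb, or any non-amd64 arch for now): when
		// the GC is active, this store goes through an ordinary call to
		// runtime.writebarrierptr, so there is a liveness map at the call
		// site.
		//
		// Buffered barrier (amd64): the guarded slow path becomes a CALL
		// to runtime.gcWriteBarrier with the destination in DI and the new
		// value in AX; it is not an ordinary call and leaves the
		// general-purpose registers intact.
		head = n
	}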
It slightly improves the performance of most of the go1 benchmarks and
improves the performance of the x/benchmarks:

name                      old time/op    new time/op    delta
BinaryTree17-12              2.40s ± 1%     2.47s ± 1%  +2.69%  (p=0.000 n=19+19)
Fannkuch11-12                2.95s ± 0%     2.95s ± 0%  +0.21%  (p=0.000 n=20+19)
FmtFprintfEmpty-12          41.8ns ± 4%    41.4ns ± 2%  -1.03%  (p=0.014 n=20+20)
FmtFprintfString-12         68.7ns ± 2%    67.5ns ± 1%  -1.75%  (p=0.000 n=20+17)
FmtFprintfInt-12            79.0ns ± 3%    77.1ns ± 1%  -2.40%  (p=0.000 n=19+17)
FmtFprintfIntInt-12          127ns ± 1%     123ns ± 3%  -3.42%  (p=0.000 n=20+20)
FmtFprintfPrefixedInt-12     152ns ± 1%     150ns ± 1%  -1.02%  (p=0.000 n=18+17)
FmtFprintfFloat-12           211ns ± 1%     209ns ± 0%  -0.99%  (p=0.000 n=20+16)
FmtManyArgs-12               500ns ± 0%     496ns ± 0%  -0.73%  (p=0.000 n=17+20)
GobDecode-12                6.44ms ± 1%    6.53ms ± 0%  +1.28%  (p=0.000 n=20+19)
GobEncode-12                5.46ms ± 0%    5.46ms ± 1%    ~     (p=0.550 n=19+20)
Gzip-12                      220ms ± 1%     216ms ± 0%  -1.75%  (p=0.000 n=19+19)
Gunzip-12                   38.8ms ± 0%    38.6ms ± 0%  -0.30%  (p=0.000 n=18+19)
HTTPClientServer-12         79.0µs ± 1%    78.2µs ± 1%  -1.01%  (p=0.000 n=20+20)
JSONEncode-12               11.9ms ± 0%    11.9ms ± 0%  -0.29%  (p=0.000 n=20+19)
JSONDecode-12               52.6ms ± 0%    52.2ms ± 0%  -0.68%  (p=0.000 n=19+20)
Mandelbrot200-12            3.69ms ± 0%    3.68ms ± 0%  -0.36%  (p=0.000 n=20+20)
GoParse-12                  3.13ms ± 1%    3.18ms ± 1%  +1.67%  (p=0.000 n=19+20)
RegexpMatchEasy0_32-12      73.2ns ± 1%    72.3ns ± 1%  -1.19%  (p=0.000 n=19+18)
RegexpMatchEasy0_1K-12       241ns ± 0%     239ns ± 0%  -0.83%  (p=0.000 n=17+16)
RegexpMatchEasy1_32-12      68.6ns ± 1%    69.0ns ± 1%  +0.47%  (p=0.015 n=18+16)
RegexpMatchEasy1_1K-12       364ns ± 0%     361ns ± 0%  -0.67%  (p=0.000 n=16+17)
RegexpMatchMedium_32-12      104ns ± 1%     103ns ± 1%  -0.79%  (p=0.001 n=20+15)
RegexpMatchMedium_1K-12     33.8µs ± 3%    34.0µs ± 2%    ~     (p=0.267 n=20+19)
RegexpMatchHard_32-12       1.64µs ± 1%    1.62µs ± 2%  -1.25%  (p=0.000 n=19+18)
RegexpMatchHard_1K-12       49.2µs ± 0%    48.7µs ± 1%  -0.93%  (p=0.000 n=19+18)
Revcomp-12                   391ms ± 5%     396ms ± 7%    ~     (p=0.154 n=19+19)
Template-12                 63.1ms ± 0%    59.5ms ± 0%  -5.76%  (p=0.000 n=18+19)
TimeParse-12                 307ns ± 0%     306ns ± 0%  -0.39%  (p=0.000 n=19+17)
TimeFormat-12                325ns ± 0%     323ns ± 0%  -0.50%  (p=0.000 n=19+19)
[Geo mean]                  47.3µs         46.9µs       -0.67%

https://perf.golang.org/search?q=upload:20171026.1

name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.25ms ± 1%  2.20ms ± 1%  -2.31%  (p=0.000 n=18+18)
HTTP-12                    12.6µs ± 0%  12.6µs ± 0%  -0.72%  (p=0.000 n=18+17)
JSON-12                    11.0ms ± 0%  11.0ms ± 1%  -0.68%  (p=0.000 n=17+19)

https://perf.golang.org/search?q=upload:20171026.2

Updates #14951.
Updates #22460.
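Before the diff itself, a pseudocode sketch of the control flow the SSA writebarrier pass builds around a single pointer store *dst = src. The identifiers below are hypothetical stand-ins for the real runtime symbols, the eager parameter models the compile-time -d=eagerwb choice, and the sketch assumes runtime.gcWriteBarrier from the previous CL both records the pointers and performs the store:

	package sketch

	// Hypothetical stand-ins so the sketch compiles; not the real runtime API.
	var writeBarrierEnabled bool

	func gcWriteBarrier(dst **int, src *int)  { *dst = src } // buffered: enqueues pointers and does the store
	func writebarrierptr(dst **int, src *int) { *dst = src } // legacy eager barrier: a normal Go call

	func storeWithBarrier(dst **int, src *int, eager bool) {
		if writeBarrierEnabled { // test of the runtime.writeBarrier flag
			if eager {
				writebarrierptr(dst, src) // ordinary call; keeps a liveness map at the call site
			} else {
				gcWriteBarrier(dst, src) // lowered as OpWB -> LoweredWB -> CALL runtime.gcWriteBarrier
			}
		} else {
			*dst = src // plain store on the fast path
		}
	}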
Change-Id: Id4c0932890a1d41020071bec73b8522b1367d3e7
Reviewed-on: https://go-review.googlesource.com/73712
Run-TryBot: Austin Clements
TryBot-Result: Gobot Gobot
Reviewed-by: Cherry Zhang
---

diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 97b01d20fa..c116336c7f 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -824,6 +824,12 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg()
 
+	case ssa.OpAMD64LoweredWB:
+		p := s.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = v.Aux.(*obj.LSym)
+
 	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
 		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
 		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
diff --git a/src/cmd/compile/internal/gc/asm_test.go b/src/cmd/compile/internal/gc/asm_test.go
index d590ff3c60..687a3a3240 100644
--- a/src/cmd/compile/internal/gc/asm_test.go
+++ b/src/cmd/compile/internal/gc/asm_test.go
@@ -468,7 +468,7 @@ var linuxAMD64Tests = []*asmTest{
 			*t = T2{}
 		}
 		`,
-		pos: []string{"\tXORPS\tX., X", "\tMOVUPS\tX., \\(.*\\)", "\tMOVQ\t\\$0, 16\\(.*\\)", "\tCALL\truntime\\.writebarrierptr\\(SB\\)"},
+		pos: []string{"\tXORPS\tX., X", "\tMOVUPS\tX., \\(.*\\)", "\tMOVQ\t\\$0, 16\\(.*\\)", "\tCALL\truntime\\.(writebarrierptr|gcWriteBarrier)\\(SB\\)"},
 	},
 	// Rotate tests
 	{
diff --git a/src/cmd/compile/internal/gc/go.go b/src/cmd/compile/internal/gc/go.go
index 663750c9f5..9b81600b60 100644
--- a/src/cmd/compile/internal/gc/go.go
+++ b/src/cmd/compile/internal/gc/go.go
@@ -287,6 +287,7 @@ var (
 	goschedguarded,
 	writeBarrier,
 	writebarrierptr,
+	gcWriteBarrier,
 	typedmemmove,
 	typedmemclr,
 	Udiv *obj.LSym
diff --git a/src/cmd/compile/internal/gc/main.go b/src/cmd/compile/internal/gc/main.go
index d94dd08012..553b7907ca 100644
--- a/src/cmd/compile/internal/gc/main.go
+++ b/src/cmd/compile/internal/gc/main.go
@@ -44,6 +44,7 @@ var (
 	Debug_slice        int
 	Debug_vlog         bool
 	Debug_wb           int
+	Debug_eagerwb      int
 	Debug_pctab        string
 	Debug_locationlist int
 	Debug_typecheckinl int
@@ -70,6 +71,7 @@ var debugtab = []struct {
 	{"slice", "print information about slice compilation", &Debug_slice},
 	{"typeassert", "print information about type assertion inlining", &Debug_typeassert},
 	{"wb", "print information about write barriers", &Debug_wb},
+	{"eagerwb", "use unbuffered write barrier", &Debug_eagerwb},
 	{"export", "print export data", &Debug_export},
 	{"pctab", "print named pc-value table", &Debug_pctab},
 	{"locationlists", "print information about DWARF location list creation", &Debug_locationlist},
@@ -400,6 +402,12 @@ func Main(archInit func(*Arch)) {
 		Debug['l'] = 1 - Debug['l']
 	}
 
+	// The buffered write barrier is only implemented on amd64
+	// right now.
+	if objabi.GOARCH != "amd64" {
+		Debug_eagerwb = 1
+	}
+
 	trackScopes = flagDWARF && ((Debug['l'] == 0 && Debug['N'] != 0) || Ctxt.Flag_locationlists)
 
 	Widthptr = thearch.LinkArch.PtrSize
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index be1dd8bd87..9eeeb35599 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -90,6 +90,7 @@ func initssaconfig() {
 	goschedguarded = sysfunc("goschedguarded")
 	writeBarrier = sysfunc("writeBarrier")
 	writebarrierptr = sysfunc("writebarrierptr")
+	gcWriteBarrier = sysfunc("gcWriteBarrier")
 	typedmemmove = sysfunc("typedmemmove")
 	typedmemclr = sysfunc("typedmemclr")
 	Udiv = sysfunc("udiv")
@@ -5185,6 +5186,10 @@ func (e *ssafn) Debug_checknil() bool {
 	return Debug_checknil != 0
 }
 
+func (e *ssafn) Debug_eagerwb() bool {
+	return Debug_eagerwb != 0
+}
+
 func (e *ssafn) UseWriteBarrier() bool {
 	return use_writebarrier
 }
@@ -5197,6 +5202,8 @@ func (e *ssafn) Syslook(name string) *obj.LSym {
 		return writeBarrier
 	case "writebarrierptr":
 		return writebarrierptr
+	case "gcWriteBarrier":
+		return gcWriteBarrier
 	case "typedmemmove":
 		return typedmemmove
 	case "typedmemclr":
diff --git a/src/cmd/compile/internal/ssa/config.go b/src/cmd/compile/internal/ssa/config.go
index de3aadbbe5..61ecac4e75 100644
--- a/src/cmd/compile/internal/ssa/config.go
+++ b/src/cmd/compile/internal/ssa/config.go
@@ -88,6 +88,7 @@ type Logger interface {
 
 	// Forwards the Debug flags from gc
 	Debug_checknil() bool
+	Debug_eagerwb() bool
 }
 
 type Frontend interface {
diff --git a/src/cmd/compile/internal/ssa/export_test.go b/src/cmd/compile/internal/ssa/export_test.go
index d1d6831eb3..28ae494505 100644
--- a/src/cmd/compile/internal/ssa/export_test.go
+++ b/src/cmd/compile/internal/ssa/export_test.go
@@ -134,6 +134,7 @@ func (d DummyFrontend) Log() bool { return true }
 func (d DummyFrontend) Fatalf(_ src.XPos, msg string, args ...interface{}) { d.t.Fatalf(msg, args...) }
 func (d DummyFrontend) Warnl(_ src.XPos, msg string, args ...interface{})  { d.t.Logf(msg, args...) }
 func (d DummyFrontend) Debug_checknil() bool                               { return false }
+func (d DummyFrontend) Debug_eagerwb() bool                                { return false }
 
 var dummyTypes Types
 
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index 522d8f13cc..c0d9dda386 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -551,6 +551,9 @@
 (AtomicAnd8 ptr val mem) -> (ANDBlock ptr val mem)
 (AtomicOr8 ptr val mem) -> (ORBlock ptr val mem)
 
+// Write barrier.
+(WB {fn} destptr srcptr mem) -> (LoweredWB {fn} destptr srcptr mem)
+
 // ***************************
 // Above: lowering rules
 // Below: optimizations
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
index 699554ab2e..a15d3e4519 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -561,6 +561,10 @@ func init() {
 		//arg0=ptr,arg1=mem, returns void. Faults if ptr is nil.
 		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
 
+		// LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
+		// It saves all GP registers if necessary, but may clobber others.
+ {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), ax}, clobbers: callerSave ^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"}, + // MOVQconvert converts between pointers and integers. // We have a special op for this so as to not confuse GC // (particularly stack maps). It takes a memory arg so it diff --git a/src/cmd/compile/internal/ssa/gen/genericOps.go b/src/cmd/compile/internal/ssa/gen/genericOps.go index 5ed0ce21b2..0ad582b046 100644 --- a/src/cmd/compile/internal/ssa/gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/gen/genericOps.go @@ -331,6 +331,12 @@ var genericOps = []opData{ {name: "MoveWB", argLength: 3, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=srcptr, arg2=mem, auxint=size, aux=type. Returns memory. {name: "ZeroWB", argLength: 2, typ: "Mem", aux: "TypSize"}, // arg0=destptr, arg1=mem, auxint=size, aux=type. Returns memory. + // WB invokes runtime.gcWriteBarrier. This is not a normal + // call: it takes arguments in registers, doesn't clobber + // general-purpose registers (the exact clobber set is + // arch-dependent), and is not a safe-point. + {name: "WB", argLength: 3, typ: "Mem", aux: "Sym", symEffect: "None"}, // arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier + // Function calls. Arguments to the call have already been written to the stack. // Return values appear on the stack. The method receiver, if any, is treated // as a phantom first argument. diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index d8cae61588..e65cb1e7d6 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -668,6 +668,7 @@ const ( OpAMD64LoweredGetCallerPC OpAMD64LoweredGetCallerSP OpAMD64LoweredNilCheck + OpAMD64LoweredWB OpAMD64MOVQconvert OpAMD64MOVLconvert OpAMD64FlagEQ @@ -1923,6 +1924,7 @@ const ( OpStoreWB OpMoveWB OpZeroWB + OpWB OpClosureCall OpStaticCall OpInterCall @@ -8155,6 +8157,20 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "LoweredWB", + auxType: auxSym, + argLen: 3, + clobberFlags: true, + symEffect: SymNone, + reg: regInfo{ + inputs: []inputInfo{ + {0, 128}, // DI + {1, 1}, // AX + }, + clobbers: 4294901760, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + }, { name: "MOVQconvert", argLen: 2, @@ -23389,6 +23405,13 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "WB", + auxType: auxSym, + argLen: 3, + symEffect: SymNone, + generic: true, + }, { name: "ClosureCall", auxType: auxInt64, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 6fcc647102..df49aa5df7 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -905,6 +905,8 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpTrunc64to32_0(v) case OpTrunc64to8: return rewriteValueAMD64_OpTrunc64to8_0(v) + case OpWB: + return rewriteValueAMD64_OpWB_0(v) case OpXor16: return rewriteValueAMD64_OpXor16_0(v) case OpXor32: @@ -46955,6 +46957,24 @@ func rewriteValueAMD64_OpTrunc64to8_0(v *Value) bool { return true } } +func rewriteValueAMD64_OpWB_0(v *Value) bool { + // match: (WB {fn} destptr srcptr mem) + // cond: + // result: (LoweredWB {fn} destptr srcptr mem) + for { + fn := v.Aux + _ = v.Args[2] + destptr := v.Args[0] + srcptr := v.Args[1] + mem := v.Args[2] + v.reset(OpAMD64LoweredWB) + v.Aux = fn + v.AddArg(destptr) + v.AddArg(srcptr) + v.AddArg(mem) + return true + } +} func 
 func rewriteValueAMD64_OpXor16_0(v *Value) bool {
 	// match: (Xor16 x y)
 	// cond:
diff --git a/src/cmd/compile/internal/ssa/writebarrier.go b/src/cmd/compile/internal/ssa/writebarrier.go
index 60797158b3..b711d8d2bf 100644
--- a/src/cmd/compile/internal/ssa/writebarrier.go
+++ b/src/cmd/compile/internal/ssa/writebarrier.go
@@ -44,7 +44,7 @@ func writebarrier(f *Func) {
 	}
 
 	var sb, sp, wbaddr, const0 *Value
-	var writebarrierptr, typedmemmove, typedmemclr *obj.LSym
+	var writebarrierptr, typedmemmove, typedmemclr, gcWriteBarrier *obj.LSym
 	var stores, after []*Value
 	var sset *sparseSet
 	var storeNumber []int32
@@ -97,6 +97,9 @@ func writebarrier(f *Func) {
 		wbsym := f.fe.Syslook("writeBarrier")
 		wbaddr = f.Entry.NewValue1A(initpos, OpAddr, f.Config.Types.UInt32Ptr, wbsym, sb)
 		writebarrierptr = f.fe.Syslook("writebarrierptr")
+		if !f.fe.Debug_eagerwb() {
+			gcWriteBarrier = f.fe.Syslook("gcWriteBarrier")
+		}
 		typedmemmove = f.fe.Syslook("typedmemmove")
 		typedmemclr = f.fe.Syslook("typedmemclr")
 		const0 = f.ConstInt32(initpos, f.Config.Types.UInt32, 0)
@@ -170,6 +173,15 @@ func writebarrier(f *Func) {
 		b.Succs = b.Succs[:0]
 		b.AddEdgeTo(bThen)
 		b.AddEdgeTo(bElse)
+		// TODO: For OpStoreWB and the buffered write barrier,
+		// we could move the write out of the write barrier,
+		// which would lead to fewer branches. We could do
+		// something similar to OpZeroWB, since the runtime
+		// could provide just the barrier half and then we
+		// could unconditionally do an OpZero (which could
+		// also generate better zeroing code). OpMoveWB is
+		// trickier and would require changing how
+		// cgoCheckMemmove works.
 		bThen.AddEdgeTo(bEnd)
 		bElse.AddEdgeTo(bEnd)
 
@@ -205,7 +217,11 @@ func writebarrier(f *Func) {
 			switch w.Op {
 			case OpStoreWB, OpMoveWB, OpZeroWB:
 				volatile := w.Op == OpMoveWB && isVolatile(val)
-				memThen = wbcall(pos, bThen, fn, typ, ptr, val, memThen, sp, sb, volatile)
+				if w.Op == OpStoreWB && !f.fe.Debug_eagerwb() {
+					memThen = bThen.NewValue3A(pos, OpWB, types.TypeMem, gcWriteBarrier, ptr, val, memThen)
+				} else {
+					memThen = wbcall(pos, bThen, fn, typ, ptr, val, memThen, sp, sb, volatile)
+				}
 			case OpVarDef, OpVarLive, OpVarKill:
 				memThen = bThen.NewValue1A(pos, w.Op, types.TypeMem, w.Aux, memThen)
 			}
diff --git a/test/fixedbugs/issue15747.go b/test/fixedbugs/issue15747.go
index 836d0eab25..decabc754e 100644
--- a/test/fixedbugs/issue15747.go
+++ b/test/fixedbugs/issue15747.go
@@ -1,4 +1,4 @@
-// errorcheck -0 -live
+// errorcheck -0 -live -d=eagerwb
 
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
@@ -7,6 +7,10 @@
 // Issue 15747: liveness analysis was marking heap-escaped params live too much,
 // and worse was using the wrong bitmap bits to do so.
 
+// TODO(austin): This expects function calls to the write barrier, so
+// we enable the legacy eager write barrier. Fix this once the
+// buffered write barrier works on all arches.
+
 package p
 
 var global *[]byte
diff --git a/test/fixedbugs/issue20250.go b/test/fixedbugs/issue20250.go
index 28c85ff130..4a8fe30935 100644
--- a/test/fixedbugs/issue20250.go
+++ b/test/fixedbugs/issue20250.go
@@ -1,4 +1,4 @@
-// errorcheck -0 -live -d=compilelater
+// errorcheck -0 -live -d=compilelater,eagerwb
 
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
@@ -8,6 +8,10 @@
 // due to propagation of addrtaken to outer variables for
 // closure variables.
 
+// TODO(austin): This expects function calls to the write barrier, so
+// we enable the legacy eager write barrier. Fix this once the
+// buffered write barrier works on all arches.
+
 package p
 
 type T struct {
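As a usage note, any other liveness test that still expects calls to runtime.writebarrierptr can opt in to the legacy barrier the same way these two tests do. A hypothetical example file (not part of the CL):

	// errorcheck -0 -live -d=eagerwb

	// With the eager barrier forced, pointer stores like the one below
	// compile to ordinary runtime.writebarrierptr calls, so liveness maps
	// exist at the barrier call sites as the -live checks expect.

	package p

	var sink *int

	func store(p *int) {
		sink = p
	}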