From 44cabb802af220388aa1d74776e076e75c53aeb0 Mon Sep 17 00:00:00 2001 From: David Chase Date: Wed, 28 Sep 2022 17:10:59 -0400 Subject: [PATCH] cmd/compile: if GOGC is not set, temporarily boost it for rapid starting heap growth Benchmarking suggests about a 14-17% reduction in user build time, about 3.5-7.8% reduction for wall time. This helps most builds because small packages are common. Latest benchmarks (after the last round of improvement): (12 processors) https://perf.golang.org/search?q=upload:20221102.20 (GOMAXPROCS=2) https://perf.golang.org/search?q=upload:20221103.1 (48 processors) https://perf.golang.org/search?q=upload:20221102.19 (The number of compiler workers is capped at min(4, GOMAXPROCS)) An earlier, similar version of this CL at one point observed a 27% reduction in user build time (building 40+ benchmarks, 20 times), but the current form is judged to be the most reliable; it may be profitable to tweak the numbers slightly later, and/or to adjust the number of compiler workers. We've talked about doing this in the past, the "new"(ish) metrics package makes it a more tractable proposition. The method here is: 1. If os.Getenv("GOGC") is empty, then increase GOGC to a large value, calculated to grow the heap to 32 + 4 * compile_parallelism before a GC occurs (e.g., on a >= 4 processor box, 64M). In practice, sometimes GC occurs before that, but this still results in fewer GCs and saved time. This is "heap goal". 2. Use a finalizer to approximately detect when GC occurs, and use runtime metrics to track progress towards the goal heap size, readjusting GOGC to retarget it as necessary. Reset GOGC to 100 when the heap is "close enough" to the goal. One feared failure mode of doing this is that the finalizer will be slow to run and the heap will grow exceptionally large before GOGC is reset; I monitored the heap size at reset and exit across several boxes with a variety of processor counts and extra noise (including several builds in parallel, including a laptop with a busy many-tabs browser running) and overshoot effectively does not occur. In some cases the compiler's heap grows so rapidly that estimated live exceeds the GC goal, but this is not delayed-finalizer overshoot; the compiler is just using that much memory. In a small number of cases (3% of GCs in make.bash) the new goal is larger than predicted by as much as 38%, so check for that and redo the adjustment. I considered instead using the maximum heap size limit + GC-detecting-finalizer + reset instead, but to me that seemed like it might have a worse bad-case outcome; if the reset is delayed, it's possible the GC would start running frequently, making it harder to run the finalizer, reach 50% utilization, and the extra GCs would lose the advantage. This might also perform badly in the case that a rapidly growing heap outruns goal. In practice, this sort of overshoot hasn't been observed, and a goal of 64M is small enough to tolerate plenty of overshoot anyway. This version of the CL includes a comment urging anyone who sees the code and thinks it would work for them, to update a bug (to be created if the CL is approved) with information about their situation/experience, so that we may consider creating some more official and reliable way of obtaining the same result. Change-Id: I45df1c927c1a7d7503ade1abd1a3300e27516633 Reviewed-on: https://go-review.googlesource.com/c/go/+/436235 Run-TryBot: David Chase Reviewed-by: Keith Randall TryBot-Result: Gopher Robot Reviewed-by: Keith Randall --- src/cmd/compile/internal/base/base.go | 194 +++++++++++++++++++++++++ src/cmd/compile/internal/base/debug.go | 1 + src/cmd/compile/internal/gc/main.go | 6 + 3 files changed, 201 insertions(+) diff --git a/src/cmd/compile/internal/base/base.go b/src/cmd/compile/internal/base/base.go index 39ce8e66f7..521600b404 100644 --- a/src/cmd/compile/internal/base/base.go +++ b/src/cmd/compile/internal/base/base.go @@ -5,7 +5,11 @@ package base import ( + "fmt" "os" + "runtime" + "runtime/debug" + "runtime/metrics" ) var atExitFuncs []func() @@ -26,6 +30,196 @@ func Exit(code int) { // To enable tracing support (-t flag), set EnableTrace to true. const EnableTrace = false +// forEachGC calls fn each GC cycle until it returns false. +func forEachGC(fn func() bool) { + type T [32]byte // large enough to avoid runtime's tiny object allocator + + var finalizer func(*T) + finalizer = func(p *T) { + if fn() { + runtime.SetFinalizer(p, finalizer) + } + } + + finalizer(new(T)) +} + +// AdjustStartingHeap modifies GOGC so that GC should not occur until the heap +// grows to the requested size. This is intended but not promised, though it +// is true-mostly, depending on when the adjustment occurs and on the +// compiler's input and behavior. Once this size is approximately reached +// GOGC is reset to 100; subsequent GCs may reduce the heap below the requested +// size, but this function does not affect that. +// +// -d=gcadjust=1 enables logging of GOGC adjustment events. +// +// NOTE: If you think this code would help startup time in your own +// application and you decide to use it, please benchmark first to see if it +// actually works for you (it may not: the Go compiler is not typical), and +// whatever the outcome, please leave a comment on bug #56546. This code +// uses supported interfaces, but depends more than we like on +// current+observed behavior of the garbage collector, so if many people need +// this feature, we should consider/propose a better way to accomplish it. +func AdjustStartingHeap(requestedHeapGoal uint64) { + logHeapTweaks := Debug.GCAdjust == 1 + mp := runtime.GOMAXPROCS(0) + gcConcurrency := Flag.LowerC + + const ( + goal = "/gc/heap/goal:bytes" + count = "/gc/cycles/total:gc-cycles" + allocs = "/gc/heap/allocs:bytes" + frees = "/gc/heap/frees:bytes" + ) + + sample := []metrics.Sample{{Name: goal}, {Name: count}, {Name: allocs}, {Name: frees}} + const ( + GOAL = 0 + COUNT = 1 + ALLOCS = 2 + FREES = 3 + ) + + // Assumptions and observations of Go's garbage collector, as of Go 1.17-1.20: + + // - the initial heap goal is 4M, by fiat. It is possible for Go to start + // with a heap as small as 512k, so this may change in the future. + + // - except for the first heap goal, heap goal is a function of + // observed-live at the previous GC and current GOGC. After the first + // GC, adjusting GOGC immediately updates GOGC; before the first GC, + // adjusting GOGC does not modify goal (but the change takes effect after + // the first GC). + + // - the before/after first GC behavior is not guaranteed anywhere, it's + // just behavior, and it's a bad idea to rely on it. + + // - we don't know exactly when GC will run, even after we adjust GOGC; the + // first GC may not have happened yet, may have already happened, or may + // be currently in progress, and GCs can start for several reasons. + + // - forEachGC above will run the provided function at some delay after each + // GC's mark phase terminates; finalizers are run after marking as the + // spans containing finalizable objects are swept, driven by GC + // background activity and allocation demand. + + // - "live at last GC" is not available through the current metrics + // interface. Instead, live is estimated by knowing the adjusted value of + // GOGC and the new heap goal following a GC (this requires knowing that + // at least one GC has occurred): + // estLive = 100 * newGoal / (100 + currentGogc)] + // this new value of GOGC + // newGogc = 100*requestedHeapGoal/estLive - 100 + // will result in the desired goal. The logging code checks that the + // resulting goal is correct. + + // There's a small risk that the finalizer will be slow to run after a GC + // that expands the goal to a huge value, and that this will lead to + // out-of-memory. This doesn't seem to happen; in experiments on a variety + // of machines with a variety of extra loads to disrupt scheduling, the + // worst overshoot observed was 50% past requestedHeapGoal. + + metrics.Read(sample) + for _, s := range sample { + if s.Value.Kind() == metrics.KindBad { + // Just return, a slightly slower compilation is a tolerable outcome. + if logHeapTweaks { + fmt.Fprintf(os.Stderr, "GCAdjust: Regret unexpected KindBad for metric %s\n", s.Name) + } + return + } + } + + // Tinker with GOGC to make the heap grow rapidly at first. + currentGoal := sample[GOAL].Value.Uint64() // Believe this will be 4MByte or less, perhaps 512k + myGogc := 100 * requestedHeapGoal / currentGoal + if myGogc <= 150 { + return + } + + if logHeapTweaks { + sample := append([]metrics.Sample(nil), sample...) // avoid races with GC callback + AtExit(func() { + metrics.Read(sample) + goal := sample[GOAL].Value.Uint64() + count := sample[COUNT].Value.Uint64() + oldGogc := debug.SetGCPercent(100) + if oldGogc == 100 { + fmt.Fprintf(os.Stderr, "GCAdjust: AtExit goal %d gogc %d count %d maxprocs %d gcConcurrency %d\n", + goal, oldGogc, count, mp, gcConcurrency) + } else { + inUse := sample[ALLOCS].Value.Uint64() - sample[FREES].Value.Uint64() + overPct := 100 * (int(inUse) - int(requestedHeapGoal)) / int(requestedHeapGoal) + fmt.Fprintf(os.Stderr, "GCAdjust: AtExit goal %d gogc %d count %d maxprocs %d gcConcurrency %d overPct %d\n", + goal, oldGogc, count, mp, gcConcurrency, overPct) + + } + }) + } + + debug.SetGCPercent(int(myGogc)) + + adjustFunc := func() bool { + + metrics.Read(sample) + goal := sample[GOAL].Value.Uint64() + count := sample[COUNT].Value.Uint64() + + if goal <= requestedHeapGoal { // Stay the course + if logHeapTweaks { + fmt.Fprintf(os.Stderr, "GCAdjust: Reuse GOGC adjust, current goal %d, count is %d, current gogc %d\n", + goal, count, myGogc) + } + return true + } + + // Believe goal has been adjusted upwards, else it would be less-than-or-equal than requestedHeapGoal + calcLive := 100 * goal / (100 + myGogc) + + if 2*calcLive < requestedHeapGoal { // calcLive can exceed requestedHeapGoal! + myGogc = 100*requestedHeapGoal/calcLive - 100 + + if myGogc > 125 { + // Not done growing the heap. + oldGogc := debug.SetGCPercent(int(myGogc)) + + if logHeapTweaks { + // Check that the new goal looks right + inUse := sample[ALLOCS].Value.Uint64() - sample[FREES].Value.Uint64() + metrics.Read(sample) + newGoal := sample[GOAL].Value.Uint64() + pctOff := 100 * (int64(newGoal) - int64(requestedHeapGoal)) / int64(requestedHeapGoal) + // Check that the new goal is close to requested. 3% of make.bash fails this test. Why, TBD. + if pctOff < 2 { + fmt.Fprintf(os.Stderr, "GCAdjust: Retry GOGC adjust, current goal %d, count is %d, gogc was %d, is now %d, calcLive %d pctOff %d\n", + goal, count, oldGogc, myGogc, calcLive, pctOff) + } else { + // The GC is being annoying and not giving us the goal that we requested, say more to help understand when/why. + fmt.Fprintf(os.Stderr, "GCAdjust: Retry GOGC adjust, current goal %d, count is %d, gogc was %d, is now %d, calcLive %d pctOff %d inUse %d\n", + goal, count, oldGogc, myGogc, calcLive, pctOff, inUse) + } + } + return true + } + } + + // In this case we're done boosting GOGC, set it to 100 and don't set a new finalizer. + oldGogc := debug.SetGCPercent(100) + // inUse helps estimate how late the finalizer ran; at the instant the previous GC ended, + // it was (in theory) equal to the previous GC's heap goal. In a growing heap it is + // expected to grow to the new heap goal. + inUse := sample[ALLOCS].Value.Uint64() - sample[FREES].Value.Uint64() + overPct := 100 * (int(inUse) - int(requestedHeapGoal)) / int(requestedHeapGoal) + if logHeapTweaks { + fmt.Fprintf(os.Stderr, "GCAdjust: Reset GOGC adjust, old goal %d, count is %d, gogc was %d, calcLive %d inUse %d overPct %d\n", + goal, count, oldGogc, calcLive, inUse, overPct) + } + return false + } + + forEachGC(adjustFunc) +} + func Compiling(pkgs []string) bool { if Ctxt.Pkgpath != "" { for _, p := range pkgs { diff --git a/src/cmd/compile/internal/base/debug.go b/src/cmd/compile/internal/base/debug.go index 25a5c8c98f..083fdf8fb1 100644 --- a/src/cmd/compile/internal/base/debug.go +++ b/src/cmd/compile/internal/base/debug.go @@ -26,6 +26,7 @@ type DebugFlags struct { DwarfInl int `help:"print information about DWARF inlined function creation"` Export int `help:"print export data"` Fmahash string `help:"hash value for use in debugging platform-dependent multiply-add use" concurrent:"ok"` + GCAdjust int `help:"log adjustments to GOGC" concurrent:"ok"` GCProg int `help:"print dump of GC programs"` Gossahash string `help:"hash value for use in debugging the compiler"` InlFuncsWithClosures int `help:"allow functions with closures to be inlined"` diff --git a/src/cmd/compile/internal/gc/main.go b/src/cmd/compile/internal/gc/main.go index 3055879a2f..4a37548ecc 100644 --- a/src/cmd/compile/internal/gc/main.go +++ b/src/cmd/compile/internal/gc/main.go @@ -75,6 +75,12 @@ func Main(archInit func(*ssagen.ArchInfo)) { base.DebugSSA = ssa.PhaseOption base.ParseFlags() + if os.Getenv("GOGC") == "" { // GOGC set disables starting heap adjustment + // More processors will use more heap, but assume that more memory is available. + // So 1 processor -> 40MB, 4 -> 64MB, 12 -> 128MB + base.AdjustStartingHeap(uint64(32+8*base.Flag.LowerC) << 20) + } + types.LocalPkg = types.NewPkg(base.Ctxt.Pkgpath, "") // pseudo-package, for scoping -- 2.48.1