From: Raul Silvera
Date: Mon, 14 Sep 2015 21:03:45 +0000 (-0700)
Subject: pprof: improve sampling for heap profiling
X-Git-Tag: go1.6beta1~912
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=27ee719fb32b47b9bc59921e457f4b1e7f767968;p=gostls13.git

pprof: improve sampling for heap profiling

The current heap sampling introduces some bias that interferes
with unsampling, producing unexpected heap profiles.
The solution is to use a Poisson process to generate the
sampling points, using the formulas described at
https://en.wikipedia.org/wiki/Poisson_process

This fixes #12620

Change-Id: If2400809ed3c41de504dd6cff06be14e476ff96c
Reviewed-on: https://go-review.googlesource.com/14590
Reviewed-by: Keith Randall
Reviewed-by: Minux Ma
Run-TryBot: Minux Ma
TryBot-Result: Gobot Gobot
---

diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index 06ffbf6191..d7e43f4fe2 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -26,6 +26,8 @@ var Xadduintptr = xadduintptr
 
 var FuncPC = funcPC
 
+var Fastlog2 = fastlog2
+
 type LFNode struct {
 	Next    uint64
 	Pushcnt uintptr
diff --git a/src/runtime/fastlog2.go b/src/runtime/fastlog2.go
new file mode 100644
index 0000000000..b22e8259ad
--- /dev/null
+++ b/src/runtime/fastlog2.go
@@ -0,0 +1,33 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+// fastlog2 implements a fast approximation to the base 2 log of a
+// float64. This is used to compute a geometric distribution for heap
+// sampling, without introducing dependencies into package math. This
+// uses a very rough approximation using the float64 exponent and the
+// first 25 bits of the mantissa. The top 5 bits of the mantissa are
+// used to load limits from a table of constants and the rest are used
+// to scale linearly between them.
+func fastlog2(x float64) float64 {
+	const fastlogScaleBits = 20
+	const fastlogScaleRatio = 1.0 / (1 << fastlogScaleBits)
+
+	xBits := float64bits(x)
+	// Extract the exponent from the IEEE float64, and index the lookup
+	// table with the top 5 (fastlogNumBits) bits of the mantissa.
+	xExp := int64((xBits>>52)&0x7FF) - 1023
+	xManIndex := (xBits >> (52 - fastlogNumBits)) % (1 << fastlogNumBits)
+	xManScale := (xBits >> (52 - fastlogNumBits - fastlogScaleBits)) % (1 << fastlogScaleBits)
+
+	low, high := fastlog2Table[xManIndex], fastlog2Table[xManIndex+1]
+	return float64(xExp) + low + (high-low)*float64(xManScale)*fastlogScaleRatio
+}
+
+// float64bits returns the IEEE 754 binary representation of f.
+// Taken from math.Float64bits to avoid a dependency on package math.
+func float64bits(f float64) uint64 { return *(*uint64)(unsafe.Pointer(&f)) }
diff --git a/src/runtime/fastlog2_test.go b/src/runtime/fastlog2_test.go
new file mode 100644
index 0000000000..8937365d51
--- /dev/null
+++ b/src/runtime/fastlog2_test.go
@@ -0,0 +1,28 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"math"
+	"runtime"
+	"testing"
+)
+
+func TestFastLog2(t *testing.T) {
+	// Compute the Euclidean distance between math.Log2 and the Fastlog2
+	// implementation over the range of interest for heap sampling.
+	const randomBitCount = 26
+	var e float64
+	for i := 1; i < 1<<randomBitCount; i++ {
+		l, fl := math.Log2(float64(i)), runtime.Fastlog2(float64(i))
+		d := l - fl
+		e += d * d
+	}
+	e = math.Sqrt(e)
+
+	if e > 1.0 {
+		t.Fatalf("imprecision on fastlog2 implementation, want <=1.0, got %f", e)
+	}
+}
diff --git a/src/runtime/fastlog2table.go b/src/runtime/fastlog2table.go
new file mode 100644
index 0000000000..c36d5835f6
--- /dev/null
+++ b/src/runtime/fastlog2table.go
@@ -0,0 +1,43 @@
+// AUTO-GENERATED by mkfastlog2table.go
+// Run go generate from src/runtime to update.
+// See mkfastlog2table.go for comments.
+
+package runtime
+
+const fastlogNumBits = 5
+
+var fastlog2Table = [1<<fastlogNumBits + 1]float64{
+	0,
+	0.0443941193584535,
+	0.0874628412503394,
+	0.1292830169449664,
+	0.1699250014423124,
+	0.2094533656289501,
+	0.2479275134435855,
+	0.2854022188622487,
+	0.3219280948873623,
+	0.3575520046180838,
+	0.3923174227787603,
+	0.4262647547020979,
+	0.4594316186372973,
+	0.4918530963296748,
+	0.5235619560570131,
+	0.5545888516776376,
+	0.5849625007211562,
+	0.6147098441152083,
+	0.6438561897747247,
+	0.6724253419714961,
+	0.7004397181410922,
+	0.7279204545631996,
+	0.7548875021634686,
+	0.7813597135246599,
+	0.8073549220576042,
+	0.8328900141647422,
+	0.8579809951275719,
+	0.8826430493618416,
+	0.9068905956085185,
+	0.9307373375628867,
+	0.9541963103868752,
+	0.9772799234999168,
+	1,
+}
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ ... @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
-	c := mp.mcache
-	rate := MemProfileRate
-	if rate > 0 {
-		// pick next profile time
-		// If you change this, also change allocmcache.
-		if rate > 0x3fffffff { // make 2*rate not overflow
-			rate = 0x3fffffff
-		}
-		next := int32(fastrand1()) % (2 * int32(rate))
-		// Subtract the "remainder" of the current allocation.
-		// Otherwise objects that are close in size to sampling rate
-		// will be under-sampled, because we consistently discard this remainder.
-		next -= (int32(size) - c.next_sample)
-		if next < 0 {
-			next = 0
-		}
-		c.next_sample = next
-	}
-
+	mp.mcache.next_sample = nextSample()
 	mProf_Malloc(x, size)
 }
 
+// nextSample returns the next sampling point for heap profiling.
+// It produces a random variable with a geometric distribution and
+// mean MemProfileRate. This is done by generating a uniformly
+// distributed random number and applying the cumulative distribution
+// function for an exponential.
+func nextSample() int32 {
+	period := MemProfileRate
+
+	// make nextSample not overflow. Maximum possible step is
+	// -ln(1/(1<<randomBitCount)) * period, approximately 20 * period.
+	switch {
+	case period > 0x7000000:
+		period = 0x7000000
+	case period == 0:
+		return 0
+	}
+
+	// Let m be the sample rate (so the mean is period = 1/m),
+	// the probability distribution function is m*exp(-mx), so the CDF is
+	// p = 1 - exp(-mx), so
+	// q = 1 - p == exp(-mx)
+	// log_e(q) = -mx
+	// -log_e(q)/m = x
+	// x = -log_e(q) * period
+	// x = log_2(q) * (-log_e(2)) * period    ; Using log_2 for efficiency
+	const randomBitCount = 26
+	q := uint32(fastrand1())%(1<<randomBitCount) + 1
+	qlog := fastlog2(float64(q)) - randomBitCount
+	if qlog > 0 {
+		qlog = 0
+	}
+	const minusLog2 = -0.6931471805599453 // -ln(2)
+	return int32(qlog*(minusLog2*float64(period))) + 1
+}
+
 type persistentAlloc struct {
 	base unsafe.Pointer
 	off  uintptr
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 8c2a6b00ce..7424691b1f 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -69,16 +69,7 @@ func allocmcache() *mcache {
 	for i := 0; i < _NumSizeClasses; i++ {
 		c.alloc[i] = &emptymspan
 	}
-
-	// Set first allocation sample size.
-	rate := MemProfileRate
-	if rate > 0x3fffffff { // make 2*rate not overflow
-		rate = 0x3fffffff
-	}
-	if rate != 0 {
-		c.next_sample = int32(int(fastrand1()) % (2 * rate))
-	}
-
+	c.next_sample = nextSample()
 	return c
 }
diff --git a/src/runtime/mkfastlog2table.go b/src/runtime/mkfastlog2table.go
new file mode 100644
index 0000000000..587ebf476d
--- /dev/null
+++ b/src/runtime/mkfastlog2table.go
@@ -0,0 +1,52 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// fastlog2Table contains log2 approximations for 5 binary digits.
+// This is used to implement fastlog2, which is used for heap sampling.
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"math"
+)
+
+func main() {
+	var buf bytes.Buffer
+
+	fmt.Fprintln(&buf, "// AUTO-GENERATED by mkfastlog2table.go")
+	fmt.Fprintln(&buf, "// Run go generate from src/runtime to update.")
+	fmt.Fprintln(&buf, "// See mkfastlog2table.go for comments.")
+	fmt.Fprintln(&buf)
+	fmt.Fprintln(&buf, "package runtime")
+	fmt.Fprintln(&buf)
+	fmt.Fprintln(&buf, "const fastlogNumBits =", fastlogNumBits)
+	fmt.Fprintln(&buf)
+
+	fmt.Fprintln(&buf, "var fastlog2Table = [1<<fastlogNumBits + 1]float64{")
+	table := computeTable()
+	for _, t := range table {
+		fmt.Fprintf(&buf, "\t%v,\n", t)
+	}
+	fmt.Fprintln(&buf, "}")
+
+	if err := ioutil.WriteFile("fastlog2table.go", buf.Bytes(), 0644); err != nil {
+		log.Fatalln("error writing fastlog2table.go:", err)
+	}
+}
+
+const fastlogNumBits = 5
+
+func computeTable() []float64 {
+	fastlog2Table := make([]float64, 1<<fastlogNumBits+1)
+	for i := 0; i <= (1 << fastlogNumBits); i++ {
+		fastlog2Table[i] = math.Log2(1.0 + float64(i)/(1<<fastlogNumBits))
+	}
+	return fastlog2Table
+}
diff --git a/test/heapsampling.go b/test/heapsampling.go
new file mode 100644
--- /dev/null
+++ b/test/heapsampling.go
@@ ... @@
+// run
+
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test heap sampling logic.
+
+package main
+
+import (
+	"fmt"
+	"math"
+	"runtime"
+)
+
+var a16 *[16]byte
+var a512 *[512]byte
+var a256 *[256]byte
+var a1k *[1024]byte
+var a64k *[64 * 1024]byte
+
+// This test checks that heap sampling produces reasonable results.
+// Note that heap sampling uses randomization, so the results vary from
+// run to run. This test only checks that the resulting values appear
+// reasonable.
+func main() {
+	const countInterleaved = 10000
+	allocInterleaved(countInterleaved)
+	checkAllocations(getMemProfileRecords(), "main.allocInterleaved", countInterleaved, []int64{256 * 1024, 1024, 256 * 1024, 512, 256 * 1024, 256})
+
+	const count = 100000
+	alloc(count)
+	checkAllocations(getMemProfileRecords(), "main.alloc", count, []int64{1024, 512, 256})
+}
+
+// allocInterleaved stress-tests the heap sampling logic by
+// interleaving large and small allocations.
+func allocInterleaved(n int) {
+	for i := 0; i < n; i++ {
+		// Test verification depends on these lines being contiguous.
+		a64k = new([64 * 1024]byte)
+		a1k = new([1024]byte)
+		a64k = new([64 * 1024]byte)
+		a512 = new([512]byte)
+		a64k = new([64 * 1024]byte)
+		a256 = new([256]byte)
+	}
+}
+
+// alloc performs only small allocations for sanity testing.
+func alloc(n int) {
+	for i := 0; i < n; i++ {
+		// Test verification depends on these lines being contiguous.
+		a1k = new([1024]byte)
+		a512 = new([512]byte)
+		a256 = new([256]byte)
+	}
+}
+
+// checkAllocations validates that the profile records collected for
+// the named function are consistent with count contiguous allocations
+// of the specified sizes.
+func checkAllocations(records []runtime.MemProfileRecord, fname string, count int64, size []int64) {
+	a := allocObjects(records, fname)
+	firstLine := 0
+	for ln := range a {
+		if firstLine == 0 || firstLine > ln {
+			firstLine = ln
+		}
+	}
+	var totalcount int64
+	for i, w := range size {
+		ln := firstLine + i
+		s := a[ln]
+		checkValue(fname, ln, "objects", count, s.objects)
+		checkValue(fname, ln, "bytes", count*w, s.bytes)
+		totalcount += s.objects
+	}
+	// Check the total number of allocations, to ensure some sampling occurred.
+	if totalwant := count * int64(len(size)); totalcount <= 0 || totalcount > totalwant*1024 {
+		panic(fmt.Sprintf("%s want total count > 0 && <= %d, got %d", fname, totalwant*1024, totalcount))
+	}
+}
+
+// checkValue checks an unsampled value against its expected range.
+func checkValue(fname string, ln int, name string, want, got int64) {
+	if got < 0 || got > 1024*want {
+		panic(fmt.Sprintf("%s:%d want %s >= 0 && <= %d, got %d", fname, ln, name, 1024*want, got))
+	}
+}
+
+func getMemProfileRecords() []runtime.MemProfileRecord {
+	// Find out how many records there are (MemProfile(nil, true)),
+	// allocate that many records, and get the data.
+	// There's a race—more records might be added between
+	// the two calls—so allocate a few extra records for safety
+	// and also try again if we're very unlucky.
+	// The loop should only execute one iteration in the common case.
+	var p []runtime.MemProfileRecord
+	n, ok := runtime.MemProfile(nil, true)
+	for {
+		// Allocate room for a slightly bigger profile,
+		// in case a few more entries have been added
+		// since the call to MemProfile.
+		p = make([]runtime.MemProfileRecord, n+50)
+		n, ok = runtime.MemProfile(p, true)
+		if ok {
+			p = p[0:n]
+			break
+		}
+		// Profile grew; try again.
+	}
+	return p
+}
+
+type allocStat struct {
+	bytes, objects int64
+}
+
+// allocObjects examines the profile records for the named function
+// and returns the allocation stats aggregated by source line number.
+func allocObjects(records []runtime.MemProfileRecord, function string) map[int]allocStat {
+	a := make(map[int]allocStat)
+	for _, r := range records {
+		for _, s := range r.Stack0 {
+			if s == 0 {
+				break
+			}
+			if f := runtime.FuncForPC(s); f != nil {
+				name := f.Name()
+				_, line := f.FileLine(s)
+				if name == function {
+					allocStat := a[line]
+					allocStat.bytes += r.AllocBytes
+					allocStat.objects += r.AllocObjects
+					a[line] = allocStat
+				}
+			}
+		}
+	}
+	for line, stats := range a {
+		objects, bytes := scaleHeapSample(stats.objects, stats.bytes, int64(runtime.MemProfileRate))
+		a[line] = allocStat{bytes, objects}
+	}
+	return a
+}
+
+// scaleHeapSample unsamples heap allocations.
+// Taken from src/cmd/pprof/internal/profile/legacy_profile.go
+func scaleHeapSample(count, size, rate int64) (int64, int64) {
+	if count == 0 || size == 0 {
+		return 0, 0
+	}
+
+	if rate <= 1 {
+		// if rate==1 all samples were collected so no adjustment is needed.
+		// if rate<1 treat as unknown and skip scaling.
+		return count, size
+	}
+
+	avgSize := float64(size) / float64(count)
+	scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))
+
+	return int64(float64(count) * scale), int64(float64(size) * scale)
+}
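
The notes and sketches below are editorial additions, not part of the patch. First, the sampling draw: nextSample picks the distance to the next sampling point by inverting the exponential CDF, exactly as the comment in malloc.go derives. A minimal user-space sketch of the same draw, assuming math.Log and math/rand as stand-ins for the runtime-internal fastlog2 and fastrand1:

package main

import (
	"fmt"
	"math"
	"math/rand"
)

// nextSampleDemo mirrors the patch's nextSample: draw q uniformly
// from (0,1] and invert the exponential CDF p = 1 - exp(-x/period),
// giving x = -ln(q) * period.
func nextSampleDemo(period float64, rng *rand.Rand) float64 {
	q := rng.Float64()
	for q == 0 { // Float64 returns [0,1); keep q > 0 so the log is defined.
		q = rng.Float64()
	}
	return -math.Log(q) * period
}

func main() {
	const period = 512 * 1024 // the default MemProfileRate
	rng := rand.New(rand.NewSource(1))
	const n = 1000000
	var sum float64
	for i := 0; i < n; i++ {
		sum += nextSampleDemo(period, rng)
	}
	// The mean gap between sampling points should approach period.
	fmt.Printf("observed mean %.0f, want ~%d\n", sum/n, int(period))
}

Because the gaps are exponentially distributed, the sampling points form a Poisson process over allocated bytes, which is what makes the unsampling step statistically sound.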
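
Second, the table-driven log2: the sketch below re-implements the fastlog2 technique in ordinary user code (log2Approx, numBits, and scaleBits are names invented for this sketch). It builds the same 33-entry table of log2(1 + i/32) that mkfastlog2table.go emits, indexes it with the top 5 mantissa bits, and interpolates linearly using the next 20 bits:

package main

import (
	"fmt"
	"math"
)

const numBits = 5 // mirrors fastlogNumBits

// table[i] = log2(1 + i/32), the values mkfastlog2table.go generates.
var table [1<<numBits + 1]float64

func init() {
	for i := range table {
		table[i] = math.Log2(1 + float64(i)/(1<<numBits))
	}
}

// log2Approx splits x into exponent and mantissa, picks the table
// segment from the top 5 mantissa bits, and scales linearly between
// the segment endpoints, like fastlog2 in the patch.
func log2Approx(x float64) float64 {
	const scaleBits = 20
	bits := math.Float64bits(x)
	exp := int64((bits>>52)&0x7FF) - 1023
	idx := (bits >> (52 - numBits)) % (1 << numBits)
	scale := (bits >> (52 - numBits - scaleBits)) % (1 << scaleBits)
	low, high := table[idx], table[idx+1]
	return float64(exp) + low + (high-low)*float64(scale)/(1<<scaleBits)
}

func main() {
	for _, x := range []float64{3, 100, 1 << 20} {
		fmt.Printf("x=%-8v approx=%.6f exact=%.6f\n", x, log2Approx(x), math.Log2(x))
	}
}

For scale, the test's bound of 1.0 on the Euclidean norm of the error over all 2^26 inputs corresponds to a root-mean-square error of at most 1/sqrt(2^26) = 1/8192, about 1.2e-4 per value.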
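
Third, unsampling: scaleHeapSample in the test works because, under this Poisson model, an allocation of average size s is caught by at least one sampling point with probability p = 1 - exp(-s/rate), so each sampled object stands for 1/p real objects. A small sketch tabulating that scale factor (the rate value here is illustrative):

package main

import (
	"fmt"
	"math"
)

func main() {
	const rate = 512 * 1024.0 // sampling period in bytes
	for _, size := range []float64{256, 4096, 64 * 1024, 1 << 20} {
		p := 1 - math.Exp(-size/rate) // chance the object is sampled at all
		fmt.Printf("avg size %8.0f B: P(sampled)=%.4f, scale=%.1f\n", size, p, 1/p)
	}
}

Small objects are rarely sampled but each sample is scaled up heavily; large objects are almost always sampled and scaled by roughly 1. The old scheme drew the gap uniformly from [0, 2*rate), which this exponential correction does not match; that mismatch is the bias the commit message refers to.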
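
Finally, the clamp in nextSample: with randomBitCount = 26 the largest possible draw is -ln(1/(1<<26)) * period = 26 * ln(2) * period, about 18 * period (the comment's "approximately 20" is a safe round-up), so limiting period to 0x7000000 keeps the result within int32. A quick check of that arithmetic:

package main

import (
	"fmt"
	"math"
)

func main() {
	const randomBitCount = 26
	const maxPeriod = 0x7000000
	// Largest value nextSample can produce after the clamp.
	maxStep := randomBitCount * math.Ln2 * maxPeriod
	fmt.Printf("max step %.3e vs int32 max %.3e: fits=%v\n",
		maxStep, float64(math.MaxInt32), maxStep < float64(math.MaxInt32))
}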