pprof: improve sampling for heap profiling

author Raul Silvera <rsilvera@google.com>

Mon, 14 Sep 2015 21:03:45 +0000 (14:03 -0700)

committer Minux Ma <minux@golang.org>

Mon, 5 Oct 2015 08:15:09 +0000 (08:15 +0000)
author Raul Silvera <rsilvera@google.com>
Mon, 14 Sep 2015 21:03:45 +0000 (14:03 -0700)
committer Minux Ma <minux@golang.org>
Mon, 5 Oct 2015 08:15:09 +0000 (08:15 +0000)
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go

index 06ffbf6191812f6bd9dcb1ba298b0c09fabf72b1..d7e43f4fe28995e077b3db40946b378ef4c45c70 100644 (file)
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -26,6 +26,8 @@ var Xadduintptr = xadduintptr
  
  var FuncPC = funcPC
  
+var Fastlog2 = fastlog2
+
  type LFNode struct {
         Next    uint64
         Pushcnt uintptr
diff --git a/src/runtime/fastlog2.go b/src/runtime/fastlog2.go

new file mode 100644 (file)

index 0000000..b22e825
--- /dev/null
+++ b/src/runtime/fastlog2.go
@@ -0,0 +1,33 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+// fastlog2 implements a fast approximation to the base 2 log of a
+// float64. This is used to compute a geometric distribution for heap
+// sampling, without introducing dependencies into package math. This
+// uses a very rough approximation using the float64 exponent and the
+// first 25 bits of the mantissa. The top 5 bits of the mantissa are
+// used to load limits from a table of constants and the rest are used
+// to scale linearly between them.
+func fastlog2(x float64) float64 {
+       const fastlogScaleBits = 20
+       const fastlogScaleRatio = 1.0 / (1 << fastlogScaleBits)
+
+       xBits := float64bits(x)
+       // Extract the exponent from the IEEE float64, and index a constant
+       // table with the top fastlogNumBits (5) bits of the mantissa.
+       xExp := int64((xBits>>52)&0x7FF) - 1023
+       xManIndex := (xBits >> (52 - fastlogNumBits)) % (1 << fastlogNumBits)
+       xManScale := (xBits >> (52 - fastlogNumBits - fastlogScaleBits)) % (1 << fastlogScaleBits)
+
+       low, high := fastlog2Table[xManIndex], fastlog2Table[xManIndex+1]
+       return float64(xExp) + low + (high-low)*float64(xManScale)*fastlogScaleRatio
+}
+
+// float64bits returns the IEEE 754 binary representation of f.
+// Taken from math.Float64bits to avoid a dependency on package math.
+func float64bits(f float64) uint64 { return *(*uint64)(unsafe.Pointer(&f)) }
diff --git a/src/runtime/fastlog2_test.go b/src/runtime/fastlog2_test.go

new file mode 100644 (file)

index 0000000..8937365
--- /dev/null
+++ b/src/runtime/fastlog2_test.go
@@ -0,0 +1,28 @@
+// Copyright 2015 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+       "math"
+       "runtime"
+       "testing"
+)
+
+func TestFastLog2(t *testing.T) {
+       // Compute the Euclidean distance between math.Log2 and runtime.Fastlog2
+       // implementation over the range of interest for heap sampling.
+       const randomBitCount = 26
+       var e float64
+       for i := 1; i < 1<<randomBitCount; i++ {
+               l, fl := math.Log2(float64(i)), runtime.Fastlog2(float64(i))
+               d := l - fl
+               e += d * d
+       }
+       e = math.Sqrt(e)
+
+       if e > 1.0 {
+               t.Fatalf("imprecision on fastlog2 implementation, want <=1.0, got %f", e)
+       }
+}
diff --git a/src/runtime/fastlog2table.go b/src/runtime/fastlog2table.go

new file mode 100644 (file)

index 0000000..c36d583
--- /dev/null
+++ b/src/runtime/fastlog2table.go
@@ -0,0 +1,43 @@
+// AUTO-GENERATED by mkfastlog2table.go
+// Run go generate from src/runtime to update.
+// See mkfastlog2table.go for comments.
+
+package runtime
+
+const fastlogNumBits = 5
+
+var fastlog2Table = [1<<fastlogNumBits + 1]float64{
+       0,
+       0.0443941193584535,
+       0.08746284125033943,
+       0.12928301694496647,
+       0.16992500144231248,
+       0.2094533656289499,
+       0.24792751344358555,
+       0.28540221886224837,
+       0.3219280948873623,
+       0.3575520046180837,
+       0.39231742277876036,
+       0.4262647547020979,
+       0.4594316186372973,
+       0.4918530963296748,
+       0.5235619560570128,
+       0.5545888516776374,
+       0.5849625007211563,
+       0.6147098441152082,
+       0.6438561897747247,
+       0.6724253419714956,
+       0.7004397181410922,
+       0.7279204545631992,
+       0.7548875021634686,
+       0.7813597135246596,
+       0.8073549220576042,
+       0.8328900141647417,
+       0.8579809951275721,
+       0.8826430493618412,
+       0.9068905956085185,
+       0.9307373375628862,
+       0.9541963103868752,
+       0.9772799234999164,
+       1,
+}
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go

index f038debdd3d36a77161dbed221212aa8113d0d39..29f81a09f5327731de8acbab8ce99375b38daa6d 100644 (file)
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -792,28 +792,45 @@ func rawmem(size uintptr) unsafe.Pointer {
  }
  
  func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
-       c := mp.mcache
-       rate := MemProfileRate
-       if size < uintptr(rate) {
-               // pick next profile time
-               // If you change this, also change allocmcache.
-               if rate > 0x3fffffff { // make 2*rate not overflow
-                       rate = 0x3fffffff
-               }
-               next := int32(fastrand1()) % (2 * int32(rate))
-               // Subtract the "remainder" of the current allocation.
-               // Otherwise objects that are close in size to sampling rate
-               // will be under-sampled, because we consistently discard this remainder.
-               next -= (int32(size) - c.next_sample)
-               if next < 0 {
-                       next = 0
-               }
-               c.next_sample = next
-       }
-
+       mp.mcache.next_sample = nextSample()
         mProf_Malloc(x, size)
  }
  
+// nextSample returns the next sampling point for heap profiling.
+// It produces a random variable with a geometric distribution and
+// mean MemProfileRate. This is done by generating a uniformly
+// distributed random number and applying the cumulative distribution
+// function for an exponential.
+func nextSample() int32 {
+       period := MemProfileRate
+
+       // Make nextSample not overflow. Maximum possible step is
+       // -ln(1/(1<<randomBitCount)) * period, approximately 18 * period.
+       switch {
+       case period > 0x7000000:
+               period = 0x7000000
+       case period == 0:
+               return 0
+       }
+
+       // Let m be the sample rate (m = 1/period),
+       // the probability density function is m*exp(-mx), so the CDF is
+       // p = 1 - exp(-mx), so
+       // q = 1 - p == exp(-mx)
+       // log_e(q) = -mx
+       // -log_e(q)/m = x
+       // x = -log_e(q) * period
+       // x = log_2(q) * (-log_e(2)) * period    ; Using log_2 for efficiency
+       const randomBitCount = 26
+       q := uint32(fastrand1())%(1<<randomBitCount) + 1
+       qlog := fastlog2(float64(q)) - randomBitCount
+       if qlog > 0 {
+               qlog = 0
+       }
+       const minusLog2 = -0.6931471805599453 // -ln(2)
+       return int32(qlog*(minusLog2*float64(period))) + 1
+}
+
  type persistentAlloc struct {
         base unsafe.Pointer
         off  uintptr
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go

index 8c2a6b00cef4614c90fc240003adbd033d2ca7ef..7424691b1f5077afecf5db74eae3c51c165f0333 100644 (file)
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -69,16 +69,7 @@ func allocmcache() *mcache {
         for i := 0; i < _NumSizeClasses; i++ {
                 c.alloc[i] = &emptymspan
         }
-
-       // Set first allocation sample size.
-       rate := MemProfileRate
-       if rate > 0x3fffffff { // make 2*rate not overflow
-               rate = 0x3fffffff
-       }
-       if rate != 0 {
-               c.next_sample = int32(int(fastrand1()) % (2 * rate))
-       }
-
+       c.next_sample = nextSample()
         return c
  }
  
diff --git a/src/runtime/mkfastlog2table.go b/src/runtime/mkfastlog2table.go

new file mode 100644 (file)

index 0000000..587ebf4
--- /dev/null
+++ b/src/runtime/mkfastlog2table.go
@@ -0,0 +1,52 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// fastlog2Table contains log2 approximations for 5 binary digits.
+// This is used to implement fastlog2, which is used for heap sampling.
+
+package main
+
+import (
+       "bytes"
+       "fmt"
+       "io/ioutil"
+       "log"
+       "math"
+)
+
+func main() {
+       var buf bytes.Buffer
+
+       fmt.Fprintln(&buf, "// AUTO-GENERATED by mkfastlog2table.go")
+       fmt.Fprintln(&buf, "// Run go generate from src/runtime to update.")
+       fmt.Fprintln(&buf, "// See mkfastlog2table.go for comments.")
+       fmt.Fprintln(&buf)
+       fmt.Fprintln(&buf, "package runtime")
+       fmt.Fprintln(&buf)
+       fmt.Fprintln(&buf, "const fastlogNumBits =", fastlogNumBits)
+       fmt.Fprintln(&buf)
+
+       fmt.Fprintln(&buf, "var fastlog2Table = [1<<fastlogNumBits + 1]float64{")
+       table := computeTable()
+       for _, t := range table {
+               fmt.Fprintf(&buf, "\t%v,\n", t)
+       }
+       fmt.Fprintln(&buf, "}")
+
+       if err := ioutil.WriteFile("fastlog2table.go", buf.Bytes(), 0644); err != nil {
+               log.Fatalln(err)
+       }
+}
+
+const fastlogNumBits = 5
+
+func computeTable() []float64 {
+       fastlog2Table := make([]float64, 1<<fastlogNumBits+1)
+       for i := 0; i <= (1 << fastlogNumBits); i++ {
+               fastlog2Table[i] = math.Log2(1.0 + float64(i)/(1<<fastlogNumBits))
+       }
+       return fastlog2Table
+}
diff --git a/src/runtime/runtime.go b/src/runtime/runtime.go

index 2387d9ae8baf67520bf2385f1e4a999d66894c5b..81d3e5b3c38e8bc6fa317d10b371c4b1c0def618 100644 (file)
--- a/src/runtime/runtime.go
+++ b/src/runtime/runtime.go
@@ -8,6 +8,7 @@ import _ "unsafe" // for go:linkname
  
  //go:generate go run wincallback.go
  //go:generate go run mkduff.go
+//go:generate go run mkfastlog2table.go
  
  var ticks struct {
         lock mutex
diff --git a/test/heapsampling.go b/test/heapsampling.go

new file mode 100644 (file)

index 0000000..d5ffc7f
--- /dev/null
+++ b/test/heapsampling.go
@@ -0,0 +1,166 @@
+// run
+
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test heap sampling logic.
+
+package main
+
+import (
+       "fmt"
+       "math"
+       "runtime"
+)
+
+var a16 *[16]byte
+var a512 *[512]byte
+var a256 *[256]byte
+var a1k *[1024]byte
+var a64k *[64 * 1024]byte
+
+// This test checks that heap sampling produces reasonable
+// results. Note that heap sampling uses randomization, so the results
+// vary from run to run. This test only checks that the resulting
+// values appear reasonable.
+func main() {
+       const countInterleaved = 10000
+       allocInterleaved(countInterleaved)
+       checkAllocations(getMemProfileRecords(), "main.allocInterleaved", countInterleaved, []int64{256 * 1024, 1024, 256 * 1024, 512, 256 * 1024, 256})
+
+       const count = 100000
+       alloc(count)
+       checkAllocations(getMemProfileRecords(), "main.alloc", count, []int64{1024, 512, 256})
+}
+
+// allocInterleaved stress-tests the heap sampling logic by
+// interleaving large and small allocations.
+func allocInterleaved(n int) {
+       for i := 0; i < n; i++ {
+               // Test verification depends on these lines being contiguous.
+               a64k = new([64 * 1024]byte)
+               a1k = new([1024]byte)
+               a64k = new([64 * 1024]byte)
+               a512 = new([512]byte)
+               a64k = new([64 * 1024]byte)
+               a256 = new([256]byte)
+       }
+}
+
+// alloc performs only small allocations for sanity testing.
+func alloc(n int) {
+       for i := 0; i < n; i++ {
+               // Test verification depends on these lines being contiguous.
+               a1k = new([1024]byte)
+               a512 = new([512]byte)
+               a256 = new([256]byte)
+       }
+}
+
+// checkAllocations validates that the profile records collected for
+// the named function are consistent with count contiguous allocations
+// of the specified sizes.
+func checkAllocations(records []runtime.MemProfileRecord, fname string, count int64, size []int64) {
+       a := allocObjects(records, fname)
+       firstLine := 0
+       for ln := range a {
+               if firstLine == 0 || firstLine > ln {
+                       firstLine = ln
+               }
+       }
+       var totalcount int64
+       for i, w := range size {
+               ln := firstLine + i
+               s := a[ln]
+               checkValue(fname, ln, "objects", count, s.objects)
+               checkValue(fname, ln, "bytes", count*w, s.bytes)
+               totalcount += s.objects
+       }
+       // Check the total number of allocations, to ensure some sampling occurred.
+       if totalwant := count * int64(len(size)); totalcount <= 0 || totalcount > totalwant*1024 {
+               panic(fmt.Sprintf("%s want total count > 0 && <= %d, got %d", fname, totalwant*1024, totalcount))
+       }
+}
+
+// checkValue checks an unsampled value against a range.
+func checkValue(fname string, ln int, name string, want, got int64) {
+       if got < 0 || got > 1024*want {
+               panic(fmt.Sprintf("%s:%d want %s >= 0 && <= %d, got %d", fname, ln, name, 1024*want, got))
+       }
+}
+
+func getMemProfileRecords() []runtime.MemProfileRecord {
+       // Find out how many records there are (MemProfile(nil, true)),
+       // allocate that many records, and get the data.
+       // There's a race—more records might be added between
+       // the two calls—so allocate a few extra records for safety
+       // and also try again if we're very unlucky.
+       // The loop should only execute one iteration in the common case.
+       var p []runtime.MemProfileRecord
+       n, ok := runtime.MemProfile(nil, true)
+       for {
+               // Allocate room for a slightly bigger profile,
+               // in case a few more entries have been added
+               // since the call to MemProfile.
+               p = make([]runtime.MemProfileRecord, n+50)
+               n, ok = runtime.MemProfile(p, true)
+               if ok {
+                       p = p[0:n]
+                       break
+               }
+               // Profile grew; try again.
+       }
+       return p
+}
+
+type allocStat struct {
+       bytes, objects int64
+}
+
+// allocObjects examines the profile records for the named function
+// and returns the allocation stats aggregated by source line number.
+func allocObjects(records []runtime.MemProfileRecord, function string) map[int]allocStat {
+       a := make(map[int]allocStat)
+       for _, r := range records {
+               for _, s := range r.Stack0 {
+                       if s == 0 {
+                               break
+                       }
+                       if f := runtime.FuncForPC(s); f != nil {
+                               name := f.Name()
+                               _, line := f.FileLine(s)
+                               if name == function {
+                                       allocStat := a[line]
+                                       allocStat.bytes += r.AllocBytes
+                                       allocStat.objects += r.AllocObjects
+                                       a[line] = allocStat
+                               }
+                       }
+               }
+       }
+       for line, stats := range a {
+               objects, bytes := scaleHeapSample(stats.objects, stats.bytes, int64(runtime.MemProfileRate))
+               a[line] = allocStat{bytes, objects}
+       }
+       return a
+}
+
+// scaleHeapSample unsamples heap allocations.
+// Taken from src/cmd/pprof/internal/profile/legacy_profile.go
+func scaleHeapSample(count, size, rate int64) (int64, int64) {
+       if count == 0 || size == 0 {
+               return 0, 0
+       }
+
+       if rate <= 1 {
+               // if rate==1 all samples were collected so no adjustment is needed.
+               // if rate<1 treat as unknown and skip scaling.
+               return count, size
+       }
+
+       avgSize := float64(size) / float64(count)
+       scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))
+
+       return int64(float64(count) * scale), int64(float64(size) * scale)
+}
author	Raul Silvera <rsilvera@google.com>
	Mon, 14 Sep 2015 21:03:45 +0000 (14:03 -0700)
committer	Minux Ma <minux@golang.org>
	Mon, 5 Oct 2015 08:15:09 +0000 (08:15 +0000)
src/runtime/export_test.go		patch \| blob \| history
src/runtime/fastlog2.go	[new file with mode: 0644]	patch \| blob
src/runtime/fastlog2_test.go	[new file with mode: 0644]	patch \| blob
src/runtime/fastlog2table.go	[new file with mode: 0644]	patch \| blob
src/runtime/malloc.go		patch \| blob \| history
src/runtime/mcache.go		patch \| blob \| history
src/runtime/mkfastlog2table.go	[new file with mode: 0644]	patch \| blob
src/runtime/runtime.go		patch \| blob \| history
test/heapsampling.go	[new file with mode: 0644]	patch \| blob