internal/runtime/cgroup: CPU cgroup limit discovery

author Michael Pratt <mpratt@google.com>

Tue, 22 Apr 2025 10:24:37 +0000 (10:24 +0000)

committer Michael Pratt <mpratt@google.com>

Wed, 21 May 2025 17:21:47 +0000 (10:21 -0700)
author Michael Pratt <mpratt@google.com>
Tue, 22 Apr 2025 10:24:37 +0000 (10:24 +0000)
committer Michael Pratt <mpratt@google.com>
Wed, 21 May 2025 17:21:47 +0000 (10:21 -0700)
diff --git a/src/internal/runtime/cgroup/cgroup_linux.go b/src/internal/runtime/cgroup/cgroup_linux.go

new file mode 100644 (file)

index 0000000..2fc3b22
--- /dev/null
+++ b/src/internal/runtime/cgroup/cgroup_linux.go
@@ -0,0 +1,710 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cgroup
+
+import (
+       "internal/bytealg"
+       "internal/runtime/strconv"
+       "internal/runtime/syscall"
+)
+
+var (
+       ErrNoCgroup error = stringError("not in a cgroup")
+
+       errMalformedFile error = stringError("malformed file")
+)
+
+const _PATH_MAX = 4096
+
+const (
+       // Required amount of scratch space for CPULimit.
+       //
+       // TODO(prattmic): This is shockingly large (~70KiB) due to the (very
+       // unlikely) combination of extremely long paths consisting mostly
+       // escaped characters. The scratch buffer ends up in .bss in package
+       // runtime, so it doesn't contribute to binary size and generally won't
+       // be faulted in, but it would still be nice to shrink this. A more
+       // complex parser that did not need to keep entire lines in memory
+       // could get away with much less. Alternatively, we could do a one-off
+       // mmap allocation for this buffer, which is only mapped larger if we
+       // actually need the extra space.
+       ScratchSize = PathSize + ParseSize
+
+       // Required space to store a path of the cgroup in the filesystem.
+       PathSize = _PATH_MAX
+
+       // /proc/self/mountinfo path escape sequences are 4 characters long, so
+       // a path consisting entirely of escaped characters could be 4 times
+       // larger.
+       escapedPathMax = 4 * _PATH_MAX
+
+       // Required space to parse /proc/self/mountinfo and /proc/self/cgroup.
+       // See findCPUMount and findCPURelativePath.
+       ParseSize = 4 * escapedPathMax
+)
+
+// Include explicit NUL to be sure we include it in the slice.
+const (
+       v2MaxFile    = "/cpu.max\x00"
+       v1QuotaFile  = "/cpu.cfs_quota_us\x00"
+       v1PeriodFile = "/cpu.cfs_period_us\x00"
+)
+
+// Version indicates the cgroup version.
+type Version int
+
+const (
+       VersionUnknown Version = iota
+       V1
+       V2
+)
+
+// CPU owns the FDs required to read the CPU limit from a cgroup.
+type CPU struct {
+       version Version
+
+       // For cgroup v1, this is cpu.cfs_quota_us.
+       // For cgroup v2, this is cpu.max.
+       quotaFD int
+
+       // For cgroup v1, this is cpu.cfs_period_us.
+       // For cgroup v2, this is unused.
+       periodFD int
+}
+
+func (c CPU) Close() {
+       switch c.version {
+       case V1:
+               syscall.Close(c.quotaFD)
+               syscall.Close(c.periodFD)
+       case V2:
+               syscall.Close(c.quotaFD)
+       default:
+               throw("impossible cgroup version")
+       }
+}
+
+func checkBufferSize(s []byte, size int) {
+       if len(s) != size {
+               println("runtime: cgroup buffer length", len(s), "want", size)
+               throw("runtime: cgroup invalid buffer length")
+       }
+}
+
+// OpenCPU returns a CPU for the CPU cgroup containing the current process, or
+// ErrNoCgroup if the process is not in a CPU cgroup.
+//
+// scratch must have length ScratchSize.
+func OpenCPU(scratch []byte) (CPU, error) {
+       checkBufferSize(scratch, ScratchSize)
+
+       base := scratch[:PathSize]
+       scratch2 := scratch[PathSize:]
+
+       n, version, err := FindCPU(base, scratch2)
+       if err != nil {
+               return CPU{}, err
+       }
+
+       switch version {
+       case 1:
+               n2 := copy(base[n:], v1QuotaFile)
+               path := base[:n+n2]
+               quotaFD, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
+               if errno != 0 {
+                       // This may fail if this process was migrated out of
+                       // the cgroup found by FindCPU and that cgroup has been
+                       // deleted.
+                       return CPU{}, errSyscallFailed
+               }
+
+               n2 = copy(base[n:], v1PeriodFile)
+               path = base[:n+n2]
+               periodFD, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
+               if errno != 0 {
+                       // This may fail if this process was migrated out of
+                       // the cgroup found by FindCPU and that cgroup has been
+                       // deleted.
+                       return CPU{}, errSyscallFailed
+               }
+
+               c := CPU{
+                       version:  1,
+                       quotaFD:  quotaFD,
+                       periodFD: periodFD,
+               }
+               return c, nil
+       case 2:
+               n2 := copy(base[n:], v2MaxFile)
+               path := base[:n+n2]
+               maxFD, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
+               if errno != 0 {
+                       // This may fail if this process was migrated out of
+                       // the cgroup found by FindCPU and that cgroup has been
+                       // deleted.
+                       return CPU{}, errSyscallFailed
+               }
+
+               c := CPU{
+                       version:  2,
+                       quotaFD:  maxFD,
+                       periodFD: -1,
+               }
+               return c, nil
+       default:
+               throw("impossible cgroup version")
+               panic("unreachable")
+       }
+}
+
+// Returns average CPU throughput limit from the cgroup, or ok false if there
+// is no limit.
+func ReadCPULimit(c CPU) (float64, bool, error) {
+       switch c.version {
+       case 1:
+               quota, err := readV1Number(c.quotaFD)
+               if err != nil {
+                       return 0, false, errMalformedFile
+               }
+
+               if quota < 0 {
+                       // No limit.
+                       return 0, false, nil
+               }
+
+               period, err := readV1Number(c.periodFD)
+               if err != nil {
+                       return 0, false, errMalformedFile
+               }
+
+               return float64(quota) / float64(period), true, nil
+       case 2:
+               // quotaFD is the cpu.max FD.
+               return readV2Limit(c.quotaFD)
+       default:
+               throw("impossible cgroup version")
+               panic("unreachable")
+       }
+}
+
+// Returns the value from the quota/period file.
+func readV1Number(fd int) (int64, error) {
+       // The format of the file is "<value>\n" where the value is in
+       // int64 microseconds and, if quota, may be -1 to indicate no limit.
+       //
+       // MaxInt64 requires 19 bytes to display in base 10, thus the
+       // conservative max size of this file is 19 + 1 (newline) = 20 bytes.
+       // We'll provide a bit more for good measure.
+       //
+       // Always read from the beginning of the file to get a fresh value.
+       var b [64]byte
+       n, errno := syscall.Pread(fd, b[:], 0)
+       if errno != 0 {
+               return 0, errSyscallFailed
+       }
+       if n == len(b) {
+               return 0, errMalformedFile
+       }
+
+       buf := b[:n]
+       return parseV1Number(buf)
+}
+
+func parseV1Number(buf []byte) (int64, error) {
+       // Ignore trailing newline.
+       i := bytealg.IndexByte(buf, '\n')
+       if i < 0 {
+               return 0, errMalformedFile
+       }
+       buf = buf[:i]
+
+       val, ok := strconv.Atoi64(string(buf))
+       if !ok {
+               return 0, errMalformedFile
+       }
+
+       return val, nil
+}
+
+// Returns CPU throughput limit, or ok false if there is no limit.
+func readV2Limit(fd int) (float64, bool, error) {
+       // The format of the file is "<quota> <period>\n" where quota and
+       // period are microseconds and quota may be "max" to indicate no limit.
+       //
+       // Note that the kernel is inconsistent about whether the values are
+       // uint64 or int64: values are parsed as uint64 but printed as int64.
+       // See kernel/sched/core.c:cpu_max_{show,write}.
+       //
+       // In practice, the kernel limits the period to 1s (1000000us) (see
+       // max_cfs_quota_period), and the quota to (1<<44)us (see
+       // max_cfs_runtime), so these values can't get large enough for the
+       // distinction to matter.
+       //
+       // MaxInt64 requires 19 bytes to display in base 10, thus the
+       // conservative max size of this file is 19 + 19 + 1 (space) + 1
+       // (newline) = 40 bytes. We'll provide a bit more for good measure.
+       //
+       // Always read from the beginning of the file to get a fresh value.
+       var b [64]byte
+       n, errno := syscall.Pread(fd, b[:], 0)
+       if errno != 0 {
+               return 0, false, errSyscallFailed
+       }
+       if n == len(b) {
+               return 0, false, errMalformedFile
+       }
+
+       buf := b[:n]
+       return parseV2Limit(buf)
+}
+
+func parseV2Limit(buf []byte) (float64, bool, error) {
+       i := bytealg.IndexByte(buf, ' ')
+       if i < 0 {
+               return 0, false, errMalformedFile
+       }
+
+       quotaStr := buf[:i]
+       if bytealg.Compare(quotaStr, []byte("max")) == 0 {
+               // No limit.
+               return 0, false, nil
+       }
+
+       periodStr := buf[i+1:]
+       // Ignore trailing newline, if any.
+       i = bytealg.IndexByte(periodStr, '\n')
+       if i < 0 {
+               return 0, false, errMalformedFile
+       }
+       periodStr = periodStr[:i]
+
+       quota, ok := strconv.Atoi64(string(quotaStr))
+       if !ok {
+               return 0, false, errMalformedFile
+       }
+
+       period, ok := strconv.Atoi64(string(periodStr))
+       if !ok {
+               return 0, false, errMalformedFile
+       }
+
+       return float64(quota) / float64(period), true, nil
+}
+
+// FindCPU finds the path to the CPU cgroup that this process is a member of
+// and places it in out. scratch is a scratch buffer for internal use.
+//
+// out must have length PathSize. scratch must have length ParseSize.
+//
+// Returns the number of bytes written to out and the cgroup version (1 or 2).
+//
+// Returns ErrNoCgroup if the process is not in a CPU cgroup.
+func FindCPU(out []byte, scratch []byte) (int, Version, error) {
+       checkBufferSize(out, PathSize)
+       checkBufferSize(scratch, ParseSize)
+
+       // The cgroup path is <cgroup mount point> + <relative path>.
+       //
+       // This is racy if our cgroup is changed while this runs. For example,
+       // initially there is only a cgroup v2 mount and we are not in a
+       // cgroup. After, there a cgroup v1 mount with a CPU controller and we
+       // are placed in a cgroup in this hierarchy. In that case, findCPUMount
+       // could pick the v2 mount, and findCPURelativePath could find the v2
+       // relative path.
+       //
+       // In this case we'll later fail to read the cgroup files and fall back
+       // to assuming no cgroup.
+
+       n, err := FindCPUMountPoint(out, scratch)
+       if err != nil {
+               return 0, 0, err
+       }
+
+       // The relative path always starts with /, so we can directly append it
+       // to the mount point.
+       n2, version, err := FindCPURelativePath(out[n:], scratch)
+       if err != nil {
+               return 0, 0, err
+       }
+       n += n2
+
+       return n, version, nil
+}
+
+// FindCPURelativePath finds the path to the CPU cgroup that this process is a member of
+// relative to the root of the cgroup mount and places it in out. scratch is a
+// scratch buffer for internal use.
+//
+// out must have length PathSize minus the size of the cgroup mount root (if
+// known). scratch must have length ParseSize.
+//
+// Returns the number of bytes written to out and the cgroup version (1 or 2).
+//
+// Returns ErrNoCgroup if the process is not in a CPU cgroup.
+func FindCPURelativePath(out []byte, scratch []byte) (int, Version, error) {
+       path := []byte("/proc/self/cgroup\x00")
+       fd, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
+       if errno == syscall.ENOENT {
+               return 0, 0, ErrNoCgroup
+       } else if errno != 0 {
+               return 0, 0, errSyscallFailed
+       }
+
+       // The relative path always starts with /, so we can directly append it
+       // to the mount point.
+       n, version, err := parseCPURelativePath(fd, syscall.Read, out[:], scratch)
+       if err != nil {
+               syscall.Close(fd)
+               return 0, 0, err
+       }
+
+       syscall.Close(fd)
+       return n, version, nil
+}
+
+// Finds the path of the current process's CPU cgroup relative to the cgroup
+// mount and writes it to out.
+//
+// Returns the number of bytes written and the cgroup version (1 or 2).
+func parseCPURelativePath(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, Version, error) {
+       // The format of each line is
+       //
+       //   hierarchy-ID:controller-list:cgroup-path
+       //
+       // controller-list is comma-separated.
+       // See man 5 cgroup for more details.
+       //
+       // cgroup v2 has hierarchy-ID 0. If a v1 hierarchy contains "cpu", that
+       // is the CPU controller. Otherwise the v2 hierarchy (if any) is the
+       // CPU controller.
+       //
+       // hierarchy-ID and controller-list have relatively small maximum
+       // sizes, and the path can be up to _PATH_MAX, so we need a bit more
+       // than 1 _PATH_MAX of scratch space.
+
+       l := newLineReader(fd, scratch, read)
+
+       // Bytes written to out.
+       n := 0
+
+       for {
+               err := l.next()
+               if err == errIncompleteLine {
+                       // Don't allow incomplete lines. While in theory the
+                       // incomplete line may be for a controller we don't
+                       // care about, in practice all lines should be of
+                       // similar length, so we should just have a buffer big
+                       // enough for any.
+                       return 0, 0, err
+               } else if err == errEOF {
+                       break
+               } else if err != nil {
+                       return 0, 0, err
+               }
+
+               line := l.line()
+
+               // The format of each line is
+               //
+               //   hierarchy-ID:controller-list:cgroup-path
+               //
+               // controller-list is comma-separated.
+               // See man 5 cgroup for more details.
+               i := bytealg.IndexByte(line, ':')
+               if i < 0 {
+                       return 0, 0, errMalformedFile
+               }
+
+               hierarchy := line[:i]
+               line = line[i+1:]
+
+               i = bytealg.IndexByte(line, ':')
+               if i < 0 {
+                       return 0, 0, errMalformedFile
+               }
+
+               controllers := line[:i]
+               line = line[i+1:]
+
+               path := line
+
+               if string(hierarchy) == "0" {
+                       // v2 hierarchy.
+                       n = copy(out, path)
+                       // Keep searching, we might find a v1 hierarchy with a
+                       // CPU controller, which takes precedence.
+               } else {
+                       // v1 hierarchy
+                       if containsCPU(controllers) {
+                               // Found a v1 CPU controller. This must be the
+                               // only one, so we're done.
+                               return copy(out, path), V1, nil
+                       }
+               }
+       }
+
+       if n == 0 {
+               // Found nothing.
+               return 0, 0, ErrNoCgroup
+       }
+
+       // Must be v2, v1 returns above.
+       return n, V2, nil
+}
+
+// Returns true if comma-separated list b contains "cpu".
+func containsCPU(b []byte) bool {
+       for len(b) > 0 {
+               i := bytealg.IndexByte(b, ',')
+               if i < 0 {
+                       // Neither cmd/compile nor gccgo allocates for these string conversions.
+                       return string(b) == "cpu"
+               }
+
+               curr := b[:i]
+               rest := b[i+1:]
+
+               if string(curr) == "cpu" {
+                       return true
+               }
+
+               b = rest
+       }
+
+       return false
+}
+
+// FindCPUMountPoint finds the root of the CPU cgroup mount places it in out.
+// scratch is a scratch buffer for internal use.
+//
+// out must have length PathSize. scratch must have length ParseSize.
+//
+// Returns the number of bytes written to out.
+//
+// Returns ErrNoCgroup if the process is not in a CPU cgroup.
+func FindCPUMountPoint(out []byte, scratch []byte) (int, error) {
+       checkBufferSize(out, PathSize)
+       checkBufferSize(scratch, ParseSize)
+
+       path := []byte("/proc/self/mountinfo\x00")
+       fd, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
+       if errno == syscall.ENOENT {
+               return 0, ErrNoCgroup
+       } else if errno != 0 {
+               return 0, errSyscallFailed
+       }
+
+       n, err := parseCPUMount(fd, syscall.Read, out, scratch)
+       if err != nil {
+               syscall.Close(fd)
+               return 0, err
+       }
+       syscall.Close(fd)
+
+       return n, nil
+}
+
+// Returns the mount point for the cpu cgroup controller (v1 or v2) from
+// /proc/self/mountinfo.
+func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, error) {
+       // The format of each line is:
+       //
+       // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+       // (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
+       //
+       // (1) mount ID:  unique identifier of the mount (may be reused after umount)
+       // (2) parent ID:  ID of parent (or of self for the top of the mount tree)
+       // (3) major:minor:  value of st_dev for files on filesystem
+       // (4) root:  root of the mount within the filesystem
+       // (5) mount point:  mount point relative to the process's root
+       // (6) mount options:  per mount options
+       // (7) optional fields:  zero or more fields of the form "tag[:value]"
+       // (8) separator:  marks the end of the optional fields
+       // (9) filesystem type:  name of filesystem of the form "type[.subtype]"
+       // (10) mount source:  filesystem specific information or "none"
+       // (11) super options:  per super block options
+       //
+       // See man 5 proc_pid_mountinfo for more details.
+       //
+       // Note that emitted paths will not contain space, tab, newline, or
+       // carriage return. Those are escaped. See Linux show_mountinfo ->
+       // show_path. We must unescape before returning.
+       //
+       // We return the mount point (5) if the filesystem type (9) is cgroup2,
+       // or cgroup with "cpu" in the super options (11).
+       //
+       // (4), (5), and (10) are up to _PATH_MAX. The remaining fields have a
+       // small fixed maximum size, so 4*_PATH_MAX is plenty of scratch space.
+       // Note that non-cgroup mounts may have arbitrarily long (11), but we
+       // can skip those when parsing.
+
+       l := newLineReader(fd, scratch, read)
+
+       // Bytes written to out.
+       n := 0
+
+       for {
+               //incomplete := false
+               err := l.next()
+               if err == errIncompleteLine {
+                       // An incomplete line is fine as long as it doesn't
+                       // impede parsing the fields we need. It shouldn't be
+                       // possible for any mount to use more than 3*PATH_MAX
+                       // before (9) because there are two paths and all other
+                       // earlier fields have bounded options. Only (11) has
+                       // unbounded options.
+               } else if err == errEOF {
+                       break
+               } else if err != nil {
+                       return 0, err
+               }
+
+               line := l.line()
+
+               // Skip first four fields.
+               for range 4 {
+                       i := bytealg.IndexByte(line, ' ')
+                       if i < 0 {
+                               return 0, errMalformedFile
+                       }
+                       line = line[i+1:]
+               }
+
+               // (5) mount point:  mount point relative to the process's root
+               i := bytealg.IndexByte(line, ' ')
+               if i < 0 {
+                       return 0, errMalformedFile
+               }
+               mnt := line[:i]
+               line = line[i+1:]
+
+               // Skip ahead past optional fields, delimited by " - ".
+               for {
+                       i = bytealg.IndexByte(line, ' ')
+                       if i < 0 {
+                               return 0, errMalformedFile
+                       }
+                       if i+3 >= len(line) {
+                               return 0, errMalformedFile
+                       }
+                       delim := line[i : i+3]
+                       if string(delim) == " - " {
+                               line = line[i+3:]
+                               break
+                       }
+                       line = line[i+1:]
+               }
+
+               // (9) filesystem type:  name of filesystem of the form "type[.subtype]"
+               i = bytealg.IndexByte(line, ' ')
+               if i < 0 {
+                       return 0, errMalformedFile
+               }
+               ftype := line[:i]
+               line = line[i+1:]
+
+               if string(ftype) != "cgroup" && string(ftype) != "cgroup2" {
+                       continue
+               }
+
+               // As in findCPUPath, cgroup v1 with a CPU controller takes
+               // precendence over cgroup v2.
+               if string(ftype) == "cgroup2" {
+                       // v2 hierarchy.
+                       n, err = unescapePath(out, mnt)
+                       if err != nil {
+                               // Don't keep searching on error. The kernel
+                               // should never produce broken escaping.
+                               return n, err
+                       }
+                       // Keep searching, we might find a v1 hierarchy with a
+                       // CPU controller, which takes precedence.
+                       continue
+               }
+
+               // (10) mount source:  filesystem specific information or "none"
+               i = bytealg.IndexByte(line, ' ')
+               if i < 0 {
+                       return 0, errMalformedFile
+               }
+               // Don't care about mount source.
+               line = line[i+1:]
+
+               // (11) super options:  per super block options
+               superOpt := line
+
+               // v1 hierarchy
+               if containsCPU(superOpt) {
+                       // Found a v1 CPU controller. This must be the
+                       // only one, so we're done.
+                       return unescapePath(out, mnt)
+               }
+       }
+
+       if n == 0 {
+               // Found nothing.
+               return 0, ErrNoCgroup
+       }
+
+       return n, nil
+}
+
+var errInvalidEscape error = stringError("invalid path escape sequence")
+
+// unescapePath copies in to out, unescaping escape sequences generated by
+// Linux's show_path.
+//
+// That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences,
+// like '\040' for space.
+//
+// out must be at least as large as in.
+//
+// Returns the number of bytes written to out.
+//
+// Also see escapePath in cgroup_linux_test.go.
+func unescapePath(out []byte, in []byte) (int, error) {
+       // Not strictly necessary, but simplifies the implementation and will
+       // always hold in users.
+       if len(out) < len(in) {
+               throw("output too small")
+       }
+
+       var outi, ini int
+       for ini < len(in) {
+               c := in[ini]
+               if c != '\\' {
+                       out[outi] = c
+                       outi++
+                       ini++
+                       continue
+               }
+
+               // Start of escape sequence.
+
+               // Escape sequence is always 4 characters: one slash and three
+               // digits.
+               if ini+3 >= len(in) {
+                       return outi, errInvalidEscape
+               }
+
+               var outc byte
+               for i := range 3 {
+                       c := in[ini+1+i]
+                       if c < '0' || c > '9' {
+                               return outi, errInvalidEscape
+                       }
+
+                       outc *= 8
+                       outc += c - '0'
+               }
+
+               out[outi] = outc
+               outi++
+
+               ini += 4
+       }
+
+       return outi, nil
+}
diff --git a/src/internal/runtime/cgroup/cgroup_linux_test.go b/src/internal/runtime/cgroup/cgroup_linux_test.go

new file mode 100644 (file)

index 0000000..d47fe42
--- /dev/null
+++ b/src/internal/runtime/cgroup/cgroup_linux_test.go
@@ -0,0 +1,476 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cgroup_test
+
+import (
+       "fmt"
+       "internal/runtime/cgroup"
+       "io"
+       "strconv"
+       "strings"
+       "testing"
+)
+
+const _PATH_MAX = 4096
+
+func TestParseV1Number(t *testing.T) {
+       tests := []struct {
+               name     string
+               contents string
+               want     int64
+               wantErr  bool
+       }{
+               {
+                       name:     "disabled",
+                       contents: "-1\n",
+                       want:     -1,
+               },
+               {
+                       name:     "500000",
+                       contents: "500000\n",
+                       want:     500000,
+               },
+               {
+                       name:     "MaxInt64",
+                       contents: "9223372036854775807\n",
+                       want:     9223372036854775807,
+               },
+               {
+                       name:     "missing-newline",
+                       contents: "500000",
+                       wantErr:  true,
+               },
+               {
+                       name:     "not-a-number",
+                       contents: "123max\n",
+                       wantErr:  true,
+               },
+               {
+                       name:     "v2",
+                       contents: "1000 5000\n",
+                       wantErr:  true,
+               },
+       }
+
+       for _, tc := range tests {
+               t.Run(tc.name, func(t *testing.T) {
+                       got, err := cgroup.ParseV1Number([]byte(tc.contents))
+                       if tc.wantErr {
+                               if err == nil {
+                                       t.Fatalf("parseV1Number got err nil want non-nil")
+                               }
+                               return
+                       }
+                       if err != nil {
+                               t.Fatalf("parseV1Number got err %v want nil", err)
+                       }
+
+                       if got != tc.want {
+                               t.Errorf("parseV1Number got %d want %d", got, tc.want)
+                       }
+               })
+       }
+}
+
+func TestParseV2Limit(t *testing.T) {
+       tests := []struct {
+               name     string
+               contents string
+               want     float64
+               wantOK   bool
+               wantErr  bool
+       }{
+               {
+                       name:     "disabled",
+                       contents: "max 100000\n",
+                       wantOK:   false,
+               },
+               {
+                       name:     "5",
+                       contents: "500000 100000\n",
+                       want:     5,
+                       wantOK:   true,
+               },
+               {
+                       name:     "0.5",
+                       contents: "50000 100000\n",
+                       want:     0.5,
+                       wantOK:   true,
+               },
+               {
+                       name:     "2.5",
+                       contents: "250000 100000\n",
+                       want:     2.5,
+                       wantOK:   true,
+               },
+               {
+                       name:     "MaxInt64",
+                       contents: "9223372036854775807 9223372036854775807\n",
+                       want:     1,
+                       wantOK:   true,
+               },
+               {
+                       name:     "missing-newline",
+                       contents: "500000 100000",
+                       wantErr:  true,
+               },
+               {
+                       name:     "v1",
+                       contents: "500000\n",
+                       wantErr:  true,
+               },
+               {
+                       name:     "quota-not-a-number",
+                       contents: "500000us 100000\n",
+                       wantErr:  true,
+               },
+               {
+                       name:     "period-not-a-number",
+                       contents: "500000 100000us\n",
+                       wantErr:  true,
+               },
+       }
+
+       for _, tc := range tests {
+               t.Run(tc.name, func(t *testing.T) {
+                       got, gotOK, err := cgroup.ParseV2Limit([]byte(tc.contents))
+                       if tc.wantErr {
+                               if err == nil {
+                                       t.Fatalf("parseV1Limit got err nil want non-nil")
+                               }
+                               return
+                       }
+                       if err != nil {
+                               t.Fatalf("parseV2Limit got err %v want nil", err)
+                       }
+
+                       if gotOK != tc.wantOK {
+                               t.Errorf("parseV2Limit got ok %v want %v", gotOK, tc.wantOK)
+                       }
+
+                       if tc.wantOK && got != tc.want {
+                               t.Errorf("parseV2Limit got %f want %f", got, tc.want)
+                       }
+               })
+       }
+}
+
+func TestParseCPURelativePath(t *testing.T) {
+       tests := []struct {
+               name     string
+               contents string
+               want     string
+               wantVer  cgroup.Version
+               wantErr  bool
+       }{
+               {
+                       name:     "empty",
+                       contents: "",
+                       wantErr:  true,
+               },
+               {
+                       name: "v1",
+                       contents: `2:cpu,cpuacct:/a/b/cpu
+1:blkio:/a/b/blkio
+`,
+                       want:    "/a/b/cpu",
+                       wantVer: cgroup.V1,
+               },
+               {
+                       name:     "v2",
+                       contents: "0::/a/b/c\n",
+                       want:     "/a/b/c",
+                       wantVer:  cgroup.V2,
+               },
+               {
+                       name: "mixed",
+                       contents: `2:cpu,cpuacct:/a/b/cpu
+1:blkio:/a/b/blkio
+0::/a/b/v2
+`,
+                       want:    "/a/b/cpu",
+                       wantVer: cgroup.V1,
+               },
+       }
+
+       for _, tc := range tests {
+               t.Run(tc.name, func(t *testing.T) {
+                       r := strings.NewReader(tc.contents)
+                       read := func(fd int, b []byte) (int, uintptr) {
+                               n, err := r.Read(b)
+                               if err != nil && err != io.EOF {
+                                       const dummyErrno = 42
+                                       return n, dummyErrno
+                               }
+                               return n, 0
+                       }
+
+                       var got [cgroup.PathSize]byte
+                       var scratch [cgroup.ParseSize]byte
+                       n, gotVer, err := cgroup.ParseCPURelativePath(0, read, got[:], scratch[:])
+                       if (err != nil) != tc.wantErr {
+                               t.Fatalf("parseCPURelativePath got err %v want %v", err, tc.wantErr)
+                       }
+
+                       if gotVer != tc.wantVer {
+                               t.Errorf("parseCPURelativePath got cgroup version %d want %d", gotVer, tc.wantVer)
+                       }
+
+                       if string(got[:n]) != tc.want {
+                               t.Errorf("parseCPURelativePath got %q want %q", string(got[:n]), tc.want)
+                       }
+               })
+       }
+}
+
+func TestContainsCPU(t *testing.T) {
+       tests := []struct {
+               in   string
+               want bool
+       }{
+               {
+                       in:   "",
+                       want: false,
+               },
+               {
+                       in:   ",",
+                       want: false,
+               },
+               {
+                       in:   "cpu",
+                       want: true,
+               },
+               {
+                       in:   "memory,cpu",
+                       want: true,
+               },
+               {
+                       in:   "cpu,memory",
+                       want: true,
+               },
+               {
+                       in:   "memory,cpu,block",
+                       want: true,
+               },
+               {
+                       in:   "memory,cpuacct,block",
+                       want: false,
+               },
+       }
+
+       for _, tc := range tests {
+               t.Run(tc.in, func(t *testing.T) {
+                       got := cgroup.ContainsCPU([]byte(tc.in))
+                       if got != tc.want {
+                               t.Errorf("containsCPU(%q) got %v want %v", tc.in, got, tc.want)
+                       }
+               })
+       }
+}
+
+func TestParseCPUMount(t *testing.T) {
+       // Used for v2-longline. We want an overlayfs mount to have an option
+       // so long that the entire line can't possibly fit in the scratch
+       // buffer.
+       const lowerPath = "/so/many/overlay/layers"
+       overlayLongLowerDir := lowerPath
+       for i := 0; len(overlayLongLowerDir) < cgroup.ScratchSize; i++ {
+               overlayLongLowerDir += fmt.Sprintf(":%s%d", lowerPath, i)
+       }
+
+       tests := []struct {
+               name     string
+               contents string
+               want     string
+               wantErr  bool
+       }{
+               {
+                       name:     "empty",
+                       contents: "",
+                       wantErr:  true,
+               },
+               {
+                       name: "v1",
+                       contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+49 22 0:37 / /sys/fs/cgroup/memory rw - cgroup cgroup rw,memory
+54 22 0:38 / /sys/fs/cgroup/io rw - cgroup cgroup rw,io
+56 22 0:40 / /sys/fs/cgroup/cpu rw - cgroup cgroup rw,cpu,cpuacct
+58 22 0:42 / /sys/fs/cgroup/net rw - cgroup cgroup rw,net
+59 22 0:43 / /sys/fs/cgroup/cpuset rw - cgroup cgroup rw,cpuset
+`,
+                       want: "/sys/fs/cgroup/cpu",
+               },
+               {
+                       name: "v2",
+                       contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+25 21 0:22 / /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+`,
+                       want: "/sys/fs/cgroup",
+               },
+               {
+                       name: "mixed",
+                       contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+25 21 0:22 / /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+49 22 0:37 / /sys/fs/cgroup/memory rw - cgroup cgroup rw,memory
+54 22 0:38 / /sys/fs/cgroup/io rw - cgroup cgroup rw,io
+56 22 0:40 / /sys/fs/cgroup/cpu rw - cgroup cgroup rw,cpu,cpuacct
+58 22 0:42 / /sys/fs/cgroup/net rw - cgroup cgroup rw,net
+59 22 0:43 / /sys/fs/cgroup/cpuset rw - cgroup cgroup rw,cpuset
+`,
+                       want: "/sys/fs/cgroup/cpu",
+               },
+               {
+                       name: "v2-escaped",
+                       contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+25 21 0:22 / /sys/fs/cgroup/tab\011tab rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+`,
+                       want: `/sys/fs/cgroup/tab       tab`,
+               },
+               {
+                       // Overly long line on a different mount doesn't matter.
+                       name: "v2-longline",
+                       contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+262 31 0:72 / /tmp/overlay2/0143e063b02f4801de9c847ad1c5ddc21fd2ead00653064d0c72ea967b248870/merged rw,relatime shared:729 - overlay overlay rw,lowerdir=` + overlayLongLowerDir + `,upperdir=/tmp/diff,workdir=/tmp/work
+25 21 0:22 / /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+`,
+                       want: "/sys/fs/cgroup",
+               },
+       }
+
+       for _, tc := range tests {
+               t.Run(tc.name, func(t *testing.T) {
+                       r := strings.NewReader(tc.contents)
+                       read := func(fd int, b []byte) (int, uintptr) {
+                               n, err := r.Read(b)
+                               if err != nil && err != io.EOF {
+                                       const dummyErrno = 42
+                                       return n, dummyErrno
+                               }
+                               return n, 0
+                       }
+
+                       var got [cgroup.PathSize]byte
+                       var scratch [cgroup.ParseSize]byte
+                       n, err := cgroup.ParseCPUMount(0, read, got[:], scratch[:])
+                       if (err != nil) != tc.wantErr {
+                               t.Fatalf("parseCPUMount got err %v want %v", err, tc.wantErr)
+                       }
+
+                       if string(got[:n]) != tc.want {
+                               t.Errorf("parseCPUMount got %q want %q", string(got[:n]), tc.want)
+                       }
+               })
+       }
+}
+
+// escapePath performs escaping equivalent to Linux's show_path.
+//
+// That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences,
+// like '\040' for space.
+func escapePath(s string) string {
+       out := make([]rune, 0, len(s))
+       for _, c := range s {
+               switch c {
+               case '\\', ' ', '\t', '\n':
+                       out = append(out, '\\')
+                       cs := strconv.FormatInt(int64(c), 8)
+                       if len(cs) <= 2 {
+                               out = append(out, '0')
+                       }
+                       if len(cs) <= 1 {
+                               out = append(out, '0')
+                       }
+                       for _, csc := range cs {
+                               out = append(out, csc)
+                       }
+               default:
+                       out = append(out, c)
+               }
+       }
+       return string(out)
+}
+
+func TestEscapePath(t *testing.T) {
+       tests := []struct {
+               name      string
+               unescaped string
+               escaped   string
+       }{
+               {
+                       name:      "boring",
+                       unescaped: `/a/b/c`,
+                       escaped:   `/a/b/c`,
+               },
+               {
+                       name:      "space",
+                       unescaped: `/a/b b/c`,
+                       escaped:   `/a/b\040b/c`,
+               },
+               {
+                       name:      "tab",
+                       unescaped: `/a/b        b/c`,
+                       escaped:   `/a/b\011b/c`,
+               },
+               {
+                       name: "newline",
+                       unescaped: `/a/b
+b/c`,
+                       escaped: `/a/b\012b/c`,
+               },
+               {
+                       name:      "slash",
+                       unescaped: `/a/b\b/c`,
+                       escaped:   `/a/b\134b/c`,
+               },
+               {
+                       name:      "beginning",
+                       unescaped: `\b/c`,
+                       escaped:   `\134b/c`,
+               },
+               {
+                       name:      "ending",
+                       unescaped: `/a/\`,
+                       escaped:   `/a/\134`,
+               },
+       }
+
+       t.Run("escapePath", func(t *testing.T) {
+               for _, tc := range tests {
+                       t.Run(tc.name, func(t *testing.T) {
+                               got := escapePath(tc.unescaped)
+                               if got != tc.escaped {
+                                       t.Errorf("escapePath got %q want %q", got, tc.escaped)
+                               }
+                       })
+               }
+       })
+
+       t.Run("unescapePath", func(t *testing.T) {
+               for _, tc := range tests {
+                       t.Run(tc.name, func(t *testing.T) {
+                               in := []byte(tc.escaped)
+                               out := make([]byte, len(in))
+                               n, err := cgroup.UnescapePath(out, in)
+                               if err != nil {
+                                       t.Errorf("unescapePath got err %v want nil", err)
+                               }
+                               got := string(out[:n])
+                               if got != tc.unescaped {
+                                       t.Errorf("unescapePath got %q want %q", got, tc.escaped)
+                               }
+                       })
+               }
+       })
+}
diff --git a/src/internal/runtime/cgroup/export_linux_test.go b/src/internal/runtime/cgroup/export_linux_test.go

new file mode 100644 (file)

index 0000000..653fcd1
--- /dev/null
+++ b/src/internal/runtime/cgroup/export_linux_test.go
@@ -0,0 +1,15 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cgroup
+
+var ContainsCPU = containsCPU
+
+var ParseV1Number = parseV1Number
+var ParseV2Limit = parseV2Limit
+
+var ParseCPURelativePath = parseCPURelativePath
+var ParseCPUMount = parseCPUMount
+
+var UnescapePath = unescapePath
diff --git a/src/internal/runtime/cgroup/runtime.go b/src/internal/runtime/cgroup/runtime.go

new file mode 100644 (file)

index 0000000..39c9295
--- /dev/null
+++ b/src/internal/runtime/cgroup/runtime.go
@@ -0,0 +1,14 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cgroup
+
+import (
+       _ "unsafe" // for linkname
+)
+
+// Functions below pushed from runtime.
+
+//go:linkname throw
+func throw(s string)
diff --git a/src/runtime/panic.go b/src/runtime/panic.go

index b8f23cc3c2d1fc851fb58d03e7c9a5152bd84b73..95305b84bc67ce68e927f3b2e2450e888e9e388f 100644 (file)
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -1056,6 +1056,11 @@ func internal_sync_fatal(s string) {
         fatal(s)
  }
  
+//go:linkname cgroup_throw internal/runtime/cgroup.throw
+func cgroup_throw(s string) {
+       throw(s)
+}
+
  // throw triggers a fatal error that dumps a stack trace and exits.
  //
  // throw should be used for runtime-internal fatal errors where Go itself,
author	Michael Pratt <mpratt@google.com>
	Tue, 22 Apr 2025 10:24:37 +0000 (10:24 +0000)
committer	Michael Pratt <mpratt@google.com>
	Wed, 21 May 2025 17:21:47 +0000 (10:21 -0700)
src/internal/runtime/cgroup/cgroup_linux.go	[new file with mode: 0644]	patch \| blob
src/internal/runtime/cgroup/cgroup_linux_test.go	[new file with mode: 0644]	patch \| blob
src/internal/runtime/cgroup/export_linux_test.go	[new file with mode: 0644]	patch \| blob
src/internal/runtime/cgroup/runtime.go	[new file with mode: 0644]	patch \| blob
src/runtime/panic.go		patch \| blob \| history