From f12c66fbed546645389cf184b0e2ffd6ad9f78ec Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 22 Apr 2025 10:24:37 +0000 Subject: [PATCH] internal/runtime/cgroup: CPU cgroup limit discovery For #73193. Change-Id: I6a6a636ca9fa9cba429cf053468c56c2939cb1ac Reviewed-on: https://go-review.googlesource.com/c/go/+/668638 LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Knyszek --- src/internal/runtime/cgroup/cgroup_linux.go | 710 ++++++++++++++++++ .../runtime/cgroup/cgroup_linux_test.go | 476 ++++++++++++ .../runtime/cgroup/export_linux_test.go | 15 + src/internal/runtime/cgroup/runtime.go | 14 + src/runtime/panic.go | 5 + 5 files changed, 1220 insertions(+) create mode 100644 src/internal/runtime/cgroup/cgroup_linux.go create mode 100644 src/internal/runtime/cgroup/cgroup_linux_test.go create mode 100644 src/internal/runtime/cgroup/export_linux_test.go create mode 100644 src/internal/runtime/cgroup/runtime.go diff --git a/src/internal/runtime/cgroup/cgroup_linux.go b/src/internal/runtime/cgroup/cgroup_linux.go new file mode 100644 index 0000000000..2fc3b225c5 --- /dev/null +++ b/src/internal/runtime/cgroup/cgroup_linux.go @@ -0,0 +1,710 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cgroup + +import ( + "internal/bytealg" + "internal/runtime/strconv" + "internal/runtime/syscall" +) + +var ( + ErrNoCgroup error = stringError("not in a cgroup") + + errMalformedFile error = stringError("malformed file") +) + +const _PATH_MAX = 4096 + +const ( + // Required amount of scratch space for CPULimit. + // + // TODO(prattmic): This is shockingly large (~70KiB) due to the (very + // unlikely) combination of extremely long paths consisting mostly + // escaped characters. 
The scratch buffer ends up in .bss in package + // runtime, so it doesn't contribute to binary size and generally won't + // be faulted in, but it would still be nice to shrink this. A more + // complex parser that did not need to keep entire lines in memory + // could get away with much less. Alternatively, we could do a one-off + // mmap allocation for this buffer, which is only mapped larger if we + // actually need the extra space. + ScratchSize = PathSize + ParseSize + + // Required space to store a path of the cgroup in the filesystem. + PathSize = _PATH_MAX + + // /proc/self/mountinfo path escape sequences are 4 characters long, so + // a path consisting entirely of escaped characters could be 4 times + // larger. + escapedPathMax = 4 * _PATH_MAX + + // Required space to parse /proc/self/mountinfo and /proc/self/cgroup. + // See parseCPUMount and parseCPURelativePath. + ParseSize = 4 * escapedPathMax +) + +// Include explicit NUL to be sure we include it in the slice. +const ( + v2MaxFile = "/cpu.max\x00" + v1QuotaFile = "/cpu.cfs_quota_us\x00" + v1PeriodFile = "/cpu.cfs_period_us\x00" +) + +// Version indicates the cgroup version. +type Version int + +const ( + VersionUnknown Version = iota + V1 + V2 +) + +// CPU owns the FDs required to read the CPU limit from a cgroup. +type CPU struct { + version Version + + // For cgroup v1, this is cpu.cfs_quota_us. + // For cgroup v2, this is cpu.max. + quotaFD int + + // For cgroup v1, this is cpu.cfs_period_us. + // For cgroup v2, this is unused.
+ periodFD int +} + +func (c CPU) Close() { + switch c.version { + case V1: + syscall.Close(c.quotaFD) + syscall.Close(c.periodFD) + case V2: + syscall.Close(c.quotaFD) + default: + throw("impossible cgroup version") + } +} + +func checkBufferSize(s []byte, size int) { + if len(s) != size { + println("runtime: cgroup buffer length", len(s), "want", size) + throw("runtime: cgroup invalid buffer length") + } +} + +// OpenCPU returns a CPU for the CPU cgroup containing the current process, or +// ErrNoCgroup if the process is not in a CPU cgroup. +// +// scratch must have length ScratchSize. +func OpenCPU(scratch []byte) (CPU, error) { + checkBufferSize(scratch, ScratchSize) + + base := scratch[:PathSize] + scratch2 := scratch[PathSize:] + + n, version, err := FindCPU(base, scratch2) + if err != nil { + return CPU{}, err + } + + switch version { + case V1: + n2 := copy(base[n:], v1QuotaFile) + path := base[:n+n2] + quotaFD, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0) + if errno != 0 { + // This may fail if this process was migrated out of + // the cgroup found by FindCPU and that cgroup has been + // deleted. + return CPU{}, errSyscallFailed + } + + n2 = copy(base[n:], v1PeriodFile) + path = base[:n+n2] + periodFD, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0) + if errno != 0 { + // As above, the cgroup may have been deleted. Close + // the quota FD opened above so that it does not leak. + syscall.Close(quotaFD) + return CPU{}, errSyscallFailed + } + + c := CPU{ + version: V1, + quotaFD: quotaFD, + periodFD: periodFD, + } + return c, nil + case V2: + n2 := copy(base[n:], v2MaxFile) + path := base[:n+n2] + maxFD, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0) + if errno != 0 { + // This may fail if this process was migrated out of + // the cgroup found by FindCPU and that cgroup has been + // deleted.
+ return CPU{}, errSyscallFailed + } + + c := CPU{ + version: V2, + quotaFD: maxFD, + periodFD: -1, + } + return c, nil + default: + throw("impossible cgroup version") + panic("unreachable") + } +} + +// Returns average CPU throughput limit from the cgroup, or ok false if there +// is no limit. +func ReadCPULimit(c CPU) (float64, bool, error) { + switch c.version { + case V1: + quota, err := readV1Number(c.quotaFD) + if err != nil { + return 0, false, err + } + + if quota < 0 { + // No limit. + return 0, false, nil + } + + period, err := readV1Number(c.periodFD) + if err != nil { + return 0, false, err + } + + return float64(quota) / float64(period), true, nil + case V2: + // quotaFD is the cpu.max FD. + return readV2Limit(c.quotaFD) + default: + throw("impossible cgroup version") + panic("unreachable") + } +} + +// Returns the value from the quota/period file. +func readV1Number(fd int) (int64, error) { + // The format of the file is "<value>\n" where the value is in + // int64 microseconds and, if quota, may be -1 to indicate no limit. + // + // MaxInt64 requires 19 bytes to display in base 10, thus the + // conservative max size of this file is 19 + 1 (newline) = 20 bytes. + // We'll provide a bit more for good measure. + // + // Always read from the beginning of the file to get a fresh value. + var b [64]byte + n, errno := syscall.Pread(fd, b[:], 0) + if errno != 0 { + return 0, errSyscallFailed + } + if n == len(b) { + return 0, errMalformedFile + } + + buf := b[:n] + return parseV1Number(buf) +} + +func parseV1Number(buf []byte) (int64, error) { + // The trailing newline is required; strip it. + i := bytealg.IndexByte(buf, '\n') + if i < 0 { + return 0, errMalformedFile + } + buf = buf[:i] + + val, ok := strconv.Atoi64(string(buf)) + if !ok { + return 0, errMalformedFile + } + + return val, nil +} + +// Returns CPU throughput limit, or ok false if there is no limit.
+func readV2Limit(fd int) (float64, bool, error) { + // The format of the file is "<quota> <period>\n" where quota and + // period are microseconds and quota may be "max" to indicate no limit. + // + // Note that the kernel is inconsistent about whether the values are + // uint64 or int64: values are parsed as uint64 but printed as int64. + // See kernel/sched/core.c:cpu_max_{show,write}. + // + // In practice, the kernel limits the period to 1s (1000000us) (see + // max_cfs_quota_period), and the quota to (1<<44)us (see + // max_cfs_runtime), so these values can't get large enough for the + // distinction to matter. + // + // MaxInt64 requires 19 bytes to display in base 10, thus the + // conservative max size of this file is 19 + 19 + 1 (space) + 1 + // (newline) = 40 bytes. We'll provide a bit more for good measure. + // + // Always read from the beginning of the file to get a fresh value. + var b [64]byte + n, errno := syscall.Pread(fd, b[:], 0) + if errno != 0 { + return 0, false, errSyscallFailed + } + if n == len(b) { + return 0, false, errMalformedFile + } + + buf := b[:n] + return parseV2Limit(buf) +} + +func parseV2Limit(buf []byte) (float64, bool, error) { + i := bytealg.IndexByte(buf, ' ') + if i < 0 { + return 0, false, errMalformedFile + } + + quotaStr := buf[:i] + if string(quotaStr) == "max" { + // No limit. + return 0, false, nil + } + + periodStr := buf[i+1:] + // The trailing newline is required; strip it. + i = bytealg.IndexByte(periodStr, '\n') + if i < 0 { + return 0, false, errMalformedFile + } + periodStr = periodStr[:i] + + quota, ok := strconv.Atoi64(string(quotaStr)) + if !ok { + return 0, false, errMalformedFile + } + + period, ok := strconv.Atoi64(string(periodStr)) + if !ok { + return 0, false, errMalformedFile + } + + return float64(quota) / float64(period), true, nil +} + +// FindCPU finds the path to the CPU cgroup that this process is a member of +// and places it in out. scratch is a scratch buffer for internal use.
+// +// out must have length PathSize. scratch must have length ParseSize. +// +// Returns the number of bytes written to out and the cgroup version (1 or 2). +// +// Returns ErrNoCgroup if the process is not in a CPU cgroup. +func FindCPU(out []byte, scratch []byte) (int, Version, error) { + checkBufferSize(out, PathSize) + checkBufferSize(scratch, ParseSize) + + // The cgroup path is <cgroup mount point> + <relative path>. + // + // This is racy if our cgroup is changed while this runs. For example, + // initially there is only a cgroup v2 mount and we are not in a + // cgroup. After, there is a cgroup v1 mount with a CPU controller and we + // are placed in a cgroup in this hierarchy. In that case, FindCPUMountPoint + // could pick the v2 mount, and FindCPURelativePath could find the v2 + // relative path. + // + // In this case we'll later fail to read the cgroup files and fall back + // to assuming no cgroup. + + n, err := FindCPUMountPoint(out, scratch) + if err != nil { + return 0, 0, err + } + + // The relative path always starts with /, so we can directly append it + // to the mount point. + n2, version, err := FindCPURelativePath(out[n:], scratch) + if err != nil { + return 0, 0, err + } + n += n2 + + return n, version, nil +} + +// FindCPURelativePath finds the path to the CPU cgroup that this process is a member of +// relative to the root of the cgroup mount and places it in out. scratch is a +// scratch buffer for internal use. +// +// out must have length PathSize minus the size of the cgroup mount root (if +// known). scratch must have length ParseSize. +// +// Returns the number of bytes written to out and the cgroup version (1 or 2). +// +// Returns ErrNoCgroup if the process is not in a CPU cgroup.
+func FindCPURelativePath(out []byte, scratch []byte) (int, Version, error) { + path := []byte("/proc/self/cgroup\x00") + fd, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0) + if errno == syscall.ENOENT { + return 0, 0, ErrNoCgroup + } else if errno != 0 { + return 0, 0, errSyscallFailed + } + + // The relative path always starts with /, so we can directly append it + // to the mount point. + n, version, err := parseCPURelativePath(fd, syscall.Read, out[:], scratch) + if err != nil { + syscall.Close(fd) + return 0, 0, err + } + + syscall.Close(fd) + return n, version, nil +} + +// Finds the path of the current process's CPU cgroup relative to the cgroup +// mount and writes it to out. +// +// Returns the number of bytes written and the cgroup version (1 or 2). +func parseCPURelativePath(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, Version, error) { + // The format of each line is + // + // hierarchy-ID:controller-list:cgroup-path + // + // controller-list is comma-separated. + // See man 5 cgroup for more details. + // + // cgroup v2 has hierarchy-ID 0. If a v1 hierarchy contains "cpu", that + // is the CPU controller. Otherwise the v2 hierarchy (if any) is the + // CPU controller. + // + // hierarchy-ID and controller-list have relatively small maximum + // sizes, and the path can be up to _PATH_MAX, so we need a bit more + // than 1 _PATH_MAX of scratch space. + + l := newLineReader(fd, scratch, read) + + // Bytes written to out. + n := 0 + + for { + err := l.next() + if err == errIncompleteLine { + // Don't allow incomplete lines. While in theory the + // incomplete line may be for a controller we don't + // care about, in practice all lines should be of + // similar length, so we should just have a buffer big + // enough for any. 
+ return 0, 0, err + } else if err == errEOF { + break + } else if err != nil { + return 0, 0, err + } + + line := l.line() + + // The format of each line is + // + // hierarchy-ID:controller-list:cgroup-path + // + // controller-list is comma-separated. + // See man 5 cgroup for more details. + i := bytealg.IndexByte(line, ':') + if i < 0 { + return 0, 0, errMalformedFile + } + + hierarchy := line[:i] + line = line[i+1:] + + i = bytealg.IndexByte(line, ':') + if i < 0 { + return 0, 0, errMalformedFile + } + + controllers := line[:i] + line = line[i+1:] + + path := line + + if string(hierarchy) == "0" { + // v2 hierarchy. + n = copy(out, path) + // Keep searching, we might find a v1 hierarchy with a + // CPU controller, which takes precedence. + } else { + // v1 hierarchy + if containsCPU(controllers) { + // Found a v1 CPU controller. This must be the + // only one, so we're done. + return copy(out, path), V1, nil + } + } + } + + if n == 0 { + // Found nothing. + return 0, 0, ErrNoCgroup + } + + // Must be v2, v1 returns above. + return n, V2, nil +} + +// Returns true if comma-separated list b contains "cpu". +func containsCPU(b []byte) bool { + for len(b) > 0 { + i := bytealg.IndexByte(b, ',') + if i < 0 { + // Neither cmd/compile nor gccgo allocates for these string conversions. + return string(b) == "cpu" + } + + curr := b[:i] + rest := b[i+1:] + + if string(curr) == "cpu" { + return true + } + + b = rest + } + + return false +} + +// FindCPUMountPoint finds the root of the CPU cgroup mount and places it in out. +// scratch is a scratch buffer for internal use. +// +// out must have length PathSize. scratch must have length ParseSize. +// +// Returns the number of bytes written to out. +// +// Returns ErrNoCgroup if the process is not in a CPU cgroup.
+func FindCPUMountPoint(out []byte, scratch []byte) (int, error) { + checkBufferSize(out, PathSize) + checkBufferSize(scratch, ParseSize) + + path := []byte("/proc/self/mountinfo\x00") + fd, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0) + if errno == syscall.ENOENT { + return 0, ErrNoCgroup + } else if errno != 0 { + return 0, errSyscallFailed + } + + n, err := parseCPUMount(fd, syscall.Read, out, scratch) + if err != nil { + syscall.Close(fd) + return 0, err + } + syscall.Close(fd) + + return n, nil +} + +// Returns the mount point for the cpu cgroup controller (v1 or v2) from +// /proc/self/mountinfo. +func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, error) { + // The format of each line is: + // + // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + // + // (1) mount ID: unique identifier of the mount (may be reused after umount) + // (2) parent ID: ID of parent (or of self for the top of the mount tree) + // (3) major:minor: value of st_dev for files on filesystem + // (4) root: root of the mount within the filesystem + // (5) mount point: mount point relative to the process's root + // (6) mount options: per mount options + // (7) optional fields: zero or more fields of the form "tag[:value]" + // (8) separator: marks the end of the optional fields + // (9) filesystem type: name of filesystem of the form "type[.subtype]" + // (10) mount source: filesystem specific information or "none" + // (11) super options: per super block options + // + // See man 5 proc_pid_mountinfo for more details. + // + // Note that emitted paths will not contain space, tab, newline, or + // carriage return. Those are escaped. See Linux show_mountinfo -> + // show_path. We must unescape before returning. 
+ // + // We return the mount point (5) if the filesystem type (9) is cgroup2, + // or cgroup with "cpu" in the super options (11). + // + // (4), (5), and (10) are up to _PATH_MAX. The remaining fields have a + // small fixed maximum size, so 4*_PATH_MAX is plenty of scratch space. + // Note that non-cgroup mounts may have arbitrarily long (11), but we + // can skip those when parsing. + + l := newLineReader(fd, scratch, read) + + // Bytes written to out. + n := 0 + + for { + //incomplete := false + err := l.next() + if err == errIncompleteLine { + // An incomplete line is fine as long as it doesn't + // impede parsing the fields we need. It shouldn't be + // possible for any mount to use more than 3*PATH_MAX + // before (9) because there are two paths and all other + // earlier fields have bounded options. Only (11) has + // unbounded options. + } else if err == errEOF { + break + } else if err != nil { + return 0, err + } + + line := l.line() + + // Skip first four fields. + for range 4 { + i := bytealg.IndexByte(line, ' ') + if i < 0 { + return 0, errMalformedFile + } + line = line[i+1:] + } + + // (5) mount point: mount point relative to the process's root + i := bytealg.IndexByte(line, ' ') + if i < 0 { + return 0, errMalformedFile + } + mnt := line[:i] + line = line[i+1:] + + // Skip ahead past optional fields, delimited by " - ". + for { + i = bytealg.IndexByte(line, ' ') + if i < 0 { + return 0, errMalformedFile + } + if i+3 >= len(line) { + return 0, errMalformedFile + } + delim := line[i : i+3] + if string(delim) == " - " { + line = line[i+3:] + break + } + line = line[i+1:] + } + + // (9) filesystem type: name of filesystem of the form "type[.subtype]" + i = bytealg.IndexByte(line, ' ') + if i < 0 { + return 0, errMalformedFile + } + ftype := line[:i] + line = line[i+1:] + + if string(ftype) != "cgroup" && string(ftype) != "cgroup2" { + continue + } + + // As in parseCPURelativePath, cgroup v1 with a CPU controller takes + // precedence over cgroup v2.
+ if string(ftype) == "cgroup2" { + // v2 hierarchy. + n, err = unescapePath(out, mnt) + if err != nil { + // Don't keep searching on error. The kernel + // should never produce broken escaping. + return n, err + } + // Keep searching, we might find a v1 hierarchy with a + // CPU controller, which takes precedence. + continue + } + + // (10) mount source: filesystem specific information or "none" + i = bytealg.IndexByte(line, ' ') + if i < 0 { + return 0, errMalformedFile + } + // Don't care about mount source. + line = line[i+1:] + + // (11) super options: per super block options + superOpt := line + + // v1 hierarchy + if containsCPU(superOpt) { + // Found a v1 CPU controller. This must be the + // only one, so we're done. + return unescapePath(out, mnt) + } + } + + if n == 0 { + // Found nothing. + return 0, ErrNoCgroup + } + + return n, nil +} + +var errInvalidEscape error = stringError("invalid path escape sequence") + +// unescapePath copies in to out, unescaping escape sequences generated by +// Linux's show_path. +// +// That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences, +// like '\040' for space. +// +// out must be at least as large as in. +// +// Returns the number of bytes written to out. +// +// Also see escapePath in cgroup_linux_test.go. +func unescapePath(out []byte, in []byte) (int, error) { + // Not strictly necessary, but simplifies the implementation and will + // always hold in users. + if len(out) < len(in) { + throw("output too small") + } + + var outi, ini int + for ini < len(in) { + c := in[ini] + if c != '\\' { + out[outi] = c + outi++ + ini++ + continue + } + + // Start of escape sequence. + + // Escape sequence is always 4 characters: one slash and three + // digits. 
+ if ini+3 >= len(in) { + return outi, errInvalidEscape + } + + var outc byte + for i := range 3 { + c := in[ini+1+i] + if c < '0' || c > '7' { // Escapes are octal: only 0-7 are valid. + return outi, errInvalidEscape + } + + outc *= 8 + outc += c - '0' + } + + out[outi] = outc + outi++ + + ini += 4 + } + + return outi, nil +} diff --git a/src/internal/runtime/cgroup/cgroup_linux_test.go b/src/internal/runtime/cgroup/cgroup_linux_test.go new file mode 100644 index 0000000000..d47fe42067 --- /dev/null +++ b/src/internal/runtime/cgroup/cgroup_linux_test.go @@ -0,0 +1,476 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cgroup_test + +import ( + "fmt" + "internal/runtime/cgroup" + "io" + "strconv" + "strings" + "testing" +) + +const _PATH_MAX = 4096 + +func TestParseV1Number(t *testing.T) { + tests := []struct { + name string + contents string + want int64 + wantErr bool + }{ + { + name: "disabled", + contents: "-1\n", + want: -1, + }, + { + name: "500000", + contents: "500000\n", + want: 500000, + }, + { + name: "MaxInt64", + contents: "9223372036854775807\n", + want: 9223372036854775807, + }, + { + name: "missing-newline", + contents: "500000", + wantErr: true, + }, + { + name: "not-a-number", + contents: "123max\n", + wantErr: true, + }, + { + name: "v2", + contents: "1000 5000\n", + wantErr: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got, err := cgroup.ParseV1Number([]byte(tc.contents)) + if tc.wantErr { + if err == nil { + t.Fatalf("parseV1Number got err nil want non-nil") + } + return + } + if err != nil { + t.Fatalf("parseV1Number got err %v want nil", err) + } + + if got != tc.want { + t.Errorf("parseV1Number got %d want %d", got, tc.want) + } + }) + } +} + +func TestParseV2Limit(t *testing.T) { + tests := []struct { + name string + contents string + want float64 + wantOK bool + wantErr bool + }{ + { + name: "disabled", + contents:
"max 100000\n", + wantOK: false, + }, + { + name: "5", + contents: "500000 100000\n", + want: 5, + wantOK: true, + }, + { + name: "0.5", + contents: "50000 100000\n", + want: 0.5, + wantOK: true, + }, + { + name: "2.5", + contents: "250000 100000\n", + want: 2.5, + wantOK: true, + }, + { + name: "MaxInt64", + contents: "9223372036854775807 9223372036854775807\n", + want: 1, + wantOK: true, + }, + { + name: "missing-newline", + contents: "500000 100000", + wantErr: true, + }, + { + name: "v1", + contents: "500000\n", + wantErr: true, + }, + { + name: "quota-not-a-number", + contents: "500000us 100000\n", + wantErr: true, + }, + { + name: "period-not-a-number", + contents: "500000 100000us\n", + wantErr: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got, gotOK, err := cgroup.ParseV2Limit([]byte(tc.contents)) + if tc.wantErr { + if err == nil { + t.Fatalf("parseV2Limit got err nil want non-nil") + } + return + } + if err != nil { + t.Fatalf("parseV2Limit got err %v want nil", err) + } + + if gotOK != tc.wantOK { + t.Errorf("parseV2Limit got ok %v want %v", gotOK, tc.wantOK) + } + + if tc.wantOK && got != tc.want { + t.Errorf("parseV2Limit got %f want %f", got, tc.want) + } + }) + } +} + +func TestParseCPURelativePath(t *testing.T) { + tests := []struct { + name string + contents string + want string + wantVer cgroup.Version + wantErr bool + }{ + { + name: "empty", + contents: "", + wantErr: true, + }, + { + name: "v1", + contents: `2:cpu,cpuacct:/a/b/cpu +1:blkio:/a/b/blkio +`, + want: "/a/b/cpu", + wantVer: cgroup.V1, + }, + { + name: "v2", + contents: "0::/a/b/c\n", + want: "/a/b/c", + wantVer: cgroup.V2, + }, + { + name: "mixed", + contents: `2:cpu,cpuacct:/a/b/cpu +1:blkio:/a/b/blkio +0::/a/b/v2 +`, + want: "/a/b/cpu", + wantVer: cgroup.V1, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + r := strings.NewReader(tc.contents) + read := func(fd int, b []byte) (int, uintptr) { + n, err := r.Read(b)
+ if err != nil && err != io.EOF { + const dummyErrno = 42 + return n, dummyErrno + } + return n, 0 + } + + var got [cgroup.PathSize]byte + var scratch [cgroup.ParseSize]byte + n, gotVer, err := cgroup.ParseCPURelativePath(0, read, got[:], scratch[:]) + if (err != nil) != tc.wantErr { + t.Fatalf("parseCPURelativePath got err %v want %v", err, tc.wantErr) + } + + if gotVer != tc.wantVer { + t.Errorf("parseCPURelativePath got cgroup version %d want %d", gotVer, tc.wantVer) + } + + if string(got[:n]) != tc.want { + t.Errorf("parseCPURelativePath got %q want %q", string(got[:n]), tc.want) + } + }) + } +} + +func TestContainsCPU(t *testing.T) { + tests := []struct { + in string + want bool + }{ + { + in: "", + want: false, + }, + { + in: ",", + want: false, + }, + { + in: "cpu", + want: true, + }, + { + in: "memory,cpu", + want: true, + }, + { + in: "cpu,memory", + want: true, + }, + { + in: "memory,cpu,block", + want: true, + }, + { + in: "memory,cpuacct,block", + want: false, + }, + } + + for _, tc := range tests { + t.Run(tc.in, func(t *testing.T) { + got := cgroup.ContainsCPU([]byte(tc.in)) + if got != tc.want { + t.Errorf("containsCPU(%q) got %v want %v", tc.in, got, tc.want) + } + }) + } +} + +func TestParseCPUMount(t *testing.T) { + // Used for v2-longline. We want an overlayfs mount to have an option + // so long that the entire line can't possibly fit in the scratch + // buffer. 
+ const lowerPath = "/so/many/overlay/layers" + overlayLongLowerDir := lowerPath + for i := 0; len(overlayLongLowerDir) < cgroup.ScratchSize; i++ { + overlayLongLowerDir += fmt.Sprintf(":%s%d", lowerPath, i) + } + + tests := []struct { + name string + contents string + want string + wantErr bool + }{ + { + name: "empty", + contents: "", + wantErr: true, + }, + { + name: "v1", + contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw +20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw +21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw +49 22 0:37 / /sys/fs/cgroup/memory rw - cgroup cgroup rw,memory +54 22 0:38 / /sys/fs/cgroup/io rw - cgroup cgroup rw,io +56 22 0:40 / /sys/fs/cgroup/cpu rw - cgroup cgroup rw,cpu,cpuacct +58 22 0:42 / /sys/fs/cgroup/net rw - cgroup cgroup rw,net +59 22 0:43 / /sys/fs/cgroup/cpuset rw - cgroup cgroup rw,cpuset +`, + want: "/sys/fs/cgroup/cpu", + }, + { + name: "v2", + contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw +20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw +21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw +25 21 0:22 / /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw +`, + want: "/sys/fs/cgroup", + }, + { + name: "mixed", + contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw +20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw +21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw +25 21 0:22 / /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw +49 22 0:37 / /sys/fs/cgroup/memory rw - cgroup cgroup rw,memory +54 22 0:38 / /sys/fs/cgroup/io rw - cgroup cgroup rw,io +56 22 0:40 / /sys/fs/cgroup/cpu rw - cgroup cgroup rw,cpu,cpuacct +58 22 0:42 / /sys/fs/cgroup/net rw - cgroup cgroup rw,net +59 22 0:43 / /sys/fs/cgroup/cpuset rw - cgroup cgroup rw,cpuset +`, + want: "/sys/fs/cgroup/cpu", + }, + { + name: "v2-escaped", + contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw +20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw +21 22 0:20 / /sys 
rw,nosuid,nodev,noexec - sysfs sysfs rw +25 21 0:22 / /sys/fs/cgroup/tab\011tab rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw +`, + want: `/sys/fs/cgroup/tab tab`, + }, + { + // Overly long line on a different mount doesn't matter. + name: "v2-longline", + contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw +20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw +21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw +262 31 0:72 / /tmp/overlay2/0143e063b02f4801de9c847ad1c5ddc21fd2ead00653064d0c72ea967b248870/merged rw,relatime shared:729 - overlay overlay rw,lowerdir=` + overlayLongLowerDir + `,upperdir=/tmp/diff,workdir=/tmp/work +25 21 0:22 / /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw +`, + want: "/sys/fs/cgroup", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + r := strings.NewReader(tc.contents) + read := func(fd int, b []byte) (int, uintptr) { + n, err := r.Read(b) + if err != nil && err != io.EOF { + const dummyErrno = 42 + return n, dummyErrno + } + return n, 0 + } + + var got [cgroup.PathSize]byte + var scratch [cgroup.ParseSize]byte + n, err := cgroup.ParseCPUMount(0, read, got[:], scratch[:]) + if (err != nil) != tc.wantErr { + t.Fatalf("parseCPUMount got err %v want %v", err, tc.wantErr) + } + + if string(got[:n]) != tc.want { + t.Errorf("parseCPUMount got %q want %q", string(got[:n]), tc.want) + } + }) + } +} + +// escapePath performs escaping equivalent to Linux's show_path. +// +// That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences, +// like '\040' for space. 
+func escapePath(s string) string { + out := make([]rune, 0, len(s)) + for _, c := range s { + switch c { + case '\\', ' ', '\t', '\n': + out = append(out, '\\') + cs := strconv.FormatInt(int64(c), 8) + if len(cs) <= 2 { + out = append(out, '0') + } + if len(cs) <= 1 { + out = append(out, '0') + } + for _, csc := range cs { + out = append(out, csc) + } + default: + out = append(out, c) + } + } + return string(out) +} + +func TestEscapePath(t *testing.T) { + tests := []struct { + name string + unescaped string + escaped string + }{ + { + name: "boring", + unescaped: `/a/b/c`, + escaped: `/a/b/c`, + }, + { + name: "space", + unescaped: `/a/b b/c`, + escaped: `/a/b\040b/c`, + }, + { + name: "tab", + unescaped: `/a/b b/c`, + escaped: `/a/b\011b/c`, + }, + { + name: "newline", + unescaped: `/a/b +b/c`, + escaped: `/a/b\012b/c`, + }, + { + name: "slash", + unescaped: `/a/b\b/c`, + escaped: `/a/b\134b/c`, + }, + { + name: "beginning", + unescaped: `\b/c`, + escaped: `\134b/c`, + }, + { + name: "ending", + unescaped: `/a/\`, + escaped: `/a/\134`, + }, + } + + t.Run("escapePath", func(t *testing.T) { + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := escapePath(tc.unescaped) + if got != tc.escaped { + t.Errorf("escapePath got %q want %q", got, tc.escaped) + } + }) + } + }) + + t.Run("unescapePath", func(t *testing.T) { + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + in := []byte(tc.escaped) + out := make([]byte, len(in)) + n, err := cgroup.UnescapePath(out, in) + if err != nil { + t.Errorf("unescapePath got err %v want nil", err) + } + got := string(out[:n]) + if got != tc.unescaped { + t.Errorf("unescapePath got %q want %q", got, tc.unescaped) + } + }) + } + }) +} diff --git a/src/internal/runtime/cgroup/export_linux_test.go b/src/internal/runtime/cgroup/export_linux_test.go new file mode 100644 index 0000000000..653fcd1b2f --- /dev/null +++ b/src/internal/runtime/cgroup/export_linux_test.go @@ -0,0 +1,15 @@ +// Copyright 2025
The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cgroup + +var ContainsCPU = containsCPU + +var ParseV1Number = parseV1Number +var ParseV2Limit = parseV2Limit + +var ParseCPURelativePath = parseCPURelativePath +var ParseCPUMount = parseCPUMount + +var UnescapePath = unescapePath diff --git a/src/internal/runtime/cgroup/runtime.go b/src/internal/runtime/cgroup/runtime.go new file mode 100644 index 0000000000..39c9295b07 --- /dev/null +++ b/src/internal/runtime/cgroup/runtime.go @@ -0,0 +1,14 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cgroup + +import ( + _ "unsafe" // for linkname +) + +// Functions below pushed from runtime. + +//go:linkname throw +func throw(s string) diff --git a/src/runtime/panic.go b/src/runtime/panic.go index b8f23cc3c2..95305b84bc 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -1056,6 +1056,11 @@ func internal_sync_fatal(s string) { fatal(s) } +//go:linkname cgroup_throw internal/runtime/cgroup.throw +func cgroup_throw(s string) { + throw(s) +} + // throw triggers a fatal error that dumps a stack trace and exits. // // throw should be used for runtime-internal fatal errors where Go itself, -- 2.50.0