Source file src/internal/runtime/cgroup/cgroup.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package cgroup
     6  
     7  import (
     8  	"internal/bytealg"
     9  	"internal/strconv"
    10  )
    11  
var (
	// ErrNoCgroup is returned by parseCPUCgroup when /proc/self/cgroup
	// lists no CPU cgroup for the process, and by parseCPUMount when no
	// mount in /proc/self/mountinfo matches the requested cgroup.
	ErrNoCgroup error = stringError("not in a cgroup")

	// errMalformedFile is returned by the parsers in this file when the
	// input does not have the expected format.
	errMalformedFile error = stringError("malformed file")
)
    17  
// _PATH_MAX is the maximum length, in bytes, of an unescaped path that this
// package sizes its buffers for. It is the basis for PathSize and
// escapedPathMax.
//
// NOTE(review): presumably mirrors Linux's PATH_MAX; confirm.
const _PATH_MAX = 4096
    19  
const (
	// Required amount of scratch space for CPULimit.
	//
	// With the values below this is 4096 + 65536 = 69632 bytes.
	//
	// TODO(prattmic): This is shockingly large (~70KiB) due to the (very
	// unlikely) combination of extremely long paths consisting mostly of
	// escaped characters. The scratch buffer ends up in .bss in package
	// runtime, so it doesn't contribute to binary size and generally won't
	// be faulted in, but it would still be nice to shrink this. A more
	// complex parser that did not need to keep entire lines in memory
	// could get away with much less. Alternatively, we could do a one-off
	// mmap allocation for this buffer, which is only mapped larger if we
	// actually need the extra space.
	ScratchSize = PathSize + ParseSize

	// Required space to store a path of the cgroup in the filesystem.
	PathSize = _PATH_MAX

	// /proc/self/mountinfo path escape sequences are 4 characters long, so
	// a path consisting entirely of escaped characters could be 4 times
	// larger.
	escapedPathMax = 4 * _PATH_MAX

	// Required space to parse /proc/self/mountinfo and /proc/self/cgroup.
	// This is room for four maximally escaped paths (65536 bytes).
	// See findCPUMount and findCPURelativePath.
	//
	// NOTE(review): neither findCPUMount nor findCPURelativePath is
	// defined in this file; the parsers here are parseCPUMount and
	// parseCPUCgroup. Confirm that the names above are still current.
	ParseSize = 4 * escapedPathMax
)
    46  
// Version indicates the cgroup version.
type Version int

const (
	// VersionUnknown is the zero value of Version. parseCPUCgroup returns
	// it alongside a non-nil error.
	VersionUnknown Version = iota
	// V1 indicates a cgroup v1 hierarchy.
	V1
	// V2 indicates the cgroup v2 hierarchy (hierarchy-ID 0 in
	// /proc/self/cgroup).
	V2
)
    55  
    56  func parseV1Number(buf []byte) (int64, error) {
    57  	// Ignore trailing newline.
    58  	i := bytealg.IndexByte(buf, '\n')
    59  	if i < 0 {
    60  		return 0, errMalformedFile
    61  	}
    62  	buf = buf[:i]
    63  
    64  	val, err := strconv.ParseInt(string(buf), 10, 64)
    65  	if err != nil {
    66  		return 0, errMalformedFile
    67  	}
    68  
    69  	return val, nil
    70  }
    71  
    72  func parseV2Limit(buf []byte) (float64, bool, error) {
    73  	i := bytealg.IndexByte(buf, ' ')
    74  	if i < 0 {
    75  		return 0, false, errMalformedFile
    76  	}
    77  
    78  	quotaStr := buf[:i]
    79  	if bytealg.Compare(quotaStr, []byte("max")) == 0 {
    80  		// No limit.
    81  		return 0, false, nil
    82  	}
    83  
    84  	periodStr := buf[i+1:]
    85  	// Ignore trailing newline, if any.
    86  	i = bytealg.IndexByte(periodStr, '\n')
    87  	if i < 0 {
    88  		return 0, false, errMalformedFile
    89  	}
    90  	periodStr = periodStr[:i]
    91  
    92  	quota, err := strconv.ParseInt(string(quotaStr), 10, 64)
    93  	if err != nil {
    94  		return 0, false, errMalformedFile
    95  	}
    96  
    97  	period, err := strconv.ParseInt(string(periodStr), 10, 64)
    98  	if err != nil {
    99  		return 0, false, errMalformedFile
   100  	}
   101  
   102  	return float64(quota) / float64(period), true, nil
   103  }
   104  
   105  // Finds the path of the current process's CPU cgroup and writes it to out.
   106  //
   107  // fd is a file descriptor for /proc/self/cgroup.
   108  // Returns the number of bytes written and the cgroup version (1 or 2).
   109  func parseCPUCgroup(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, Version, error) {
   110  	// The format of each line is
   111  	//
   112  	//   hierarchy-ID:controller-list:cgroup-path
   113  	//
   114  	// controller-list is comma-separated.
   115  	//
   116  	// cgroup v2 has hierarchy-ID 0. If a v1 hierarchy contains "cpu", that
   117  	// is the CPU controller. Otherwise the v2 hierarchy (if any) is the
   118  	// CPU controller. It is not possible to mount the same controller
   119  	// simultaneously under both the v1 and the v2 hierarchies.
   120  	//
   121  	// See man 7 cgroups for more details.
   122  	//
   123  	// hierarchy-ID and controller-list have relatively small maximum
   124  	// sizes, and the path can be up to _PATH_MAX, so we need a bit more
   125  	// than 1 _PATH_MAX of scratch space.
   126  
   127  	l := newLineReader(fd, scratch, read)
   128  
   129  	// Bytes written to out.
   130  	n := 0
   131  
   132  	for {
   133  		err := l.next()
   134  		if err == errIncompleteLine {
   135  			// Don't allow incomplete lines. While in theory the
   136  			// incomplete line may be for a controller we don't
   137  			// care about, in practice all lines should be of
   138  			// similar length, so we should just have a buffer big
   139  			// enough for any.
   140  			return 0, 0, err
   141  		} else if err == errEOF {
   142  			break
   143  		} else if err != nil {
   144  			return 0, 0, err
   145  		}
   146  
   147  		line := l.line()
   148  
   149  		// The format of each line is
   150  		//
   151  		//   hierarchy-ID:controller-list:cgroup-path
   152  		//
   153  		// controller-list is comma-separated.
   154  		// See man 7 cgroups for more details.
   155  		i := bytealg.IndexByte(line, ':')
   156  		if i < 0 {
   157  			return 0, 0, errMalformedFile
   158  		}
   159  
   160  		hierarchy := line[:i]
   161  		line = line[i+1:]
   162  
   163  		i = bytealg.IndexByte(line, ':')
   164  		if i < 0 {
   165  			return 0, 0, errMalformedFile
   166  		}
   167  
   168  		controllers := line[:i]
   169  		line = line[i+1:]
   170  
   171  		path := line
   172  		if len(path) == 0 || path[0] != '/' {
   173  			// We rely on this when composing the full path.
   174  			return 0, 0, errMalformedFile
   175  		}
   176  		if len(path) > len(out) {
   177  			// Should not be possible. If we really get a very long cgroup path,
   178  			// read /proc/self/cgroup will fail with ENAMETOOLONG.
   179  			return 0, 0, errPathTooLong
   180  		}
   181  
   182  		if string(hierarchy) == "0" {
   183  			// v2 hierarchy.
   184  			n = copy(out, path)
   185  			// Keep searching, we might find a v1 hierarchy with a
   186  			// CPU controller, which takes precedence.
   187  		} else {
   188  			// v1 hierarchy
   189  			if containsCPU(controllers) {
   190  				// Found a v1 CPU controller. This must be the
   191  				// only one, so we're done.
   192  				return copy(out, path), V1, nil
   193  			}
   194  		}
   195  	}
   196  
   197  	if n == 0 {
   198  		// Found nothing.
   199  		return 0, 0, ErrNoCgroup
   200  	}
   201  
   202  	// Must be v2, v1 returns above.
   203  	return n, V2, nil
   204  }
   205  
   206  // Returns true if comma-separated list b contains "cpu".
   207  func containsCPU(b []byte) bool {
   208  	for len(b) > 0 {
   209  		i := bytealg.IndexByte(b, ',')
   210  		if i < 0 {
   211  			// Neither cmd/compile nor gccgo allocates for these string conversions.
   212  			return string(b) == "cpu"
   213  		}
   214  
   215  		curr := b[:i]
   216  		rest := b[i+1:]
   217  
   218  		if string(curr) == "cpu" {
   219  			return true
   220  		}
   221  
   222  		b = rest
   223  	}
   224  
   225  	return false
   226  }
   227  
   228  // Returns the path to the specified cgroup and version with cpu controller
   229  //
   230  // fd is a file descriptor for /proc/self/mountinfo.
   231  // Returns the number of bytes written.
   232  func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out, cgroup []byte, version Version, scratch []byte) (int, error) {
   233  	// The format of each line is:
   234  	//
   235  	// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
   236  	// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
   237  	//
   238  	// (1) mount ID:  unique identifier of the mount (may be reused after umount)
   239  	// (2) parent ID:  ID of parent (or of self for the top of the mount tree)
   240  	// (3) major:minor:  value of st_dev for files on filesystem
   241  	// (4) root:  root of the mount within the filesystem
   242  	// (5) mount point:  mount point relative to the process's root
   243  	// (6) mount options:  per mount options
   244  	// (7) optional fields:  zero or more fields of the form "tag[:value]"
   245  	// (8) separator:  marks the end of the optional fields
   246  	// (9) filesystem type:  name of filesystem of the form "type[.subtype]"
   247  	// (10) mount source:  filesystem specific information or "none"
   248  	// (11) super options:  per super block options
   249  	//
   250  	// See man 5 proc_pid_mountinfo for more details.
   251  	//
   252  	// Note that emitted paths will not contain space, tab, newline, or
   253  	// carriage return. Those are escaped. See Linux show_mountinfo ->
   254  	// show_path. We must unescape before returning.
   255  	//
   256  	// A mount point matches if the filesystem type (9) is cgroup2,
   257  	// or cgroup with "cpu" in the super options (11),
   258  	// and the cgroup is in the root (4). If there are multiple matches,
   259  	// the first one is selected.
   260  	//
   261  	// We return full cgroup path, which is the mount point (5) +
   262  	// cgroup parameter without the root (4) prefix.
   263  	//
   264  	// (4), (5), and (10) are up to _PATH_MAX. The remaining fields have a
   265  	// small fixed maximum size, so 4*_PATH_MAX is plenty of scratch space.
   266  	// Note that non-cgroup mounts may have arbitrarily long (11), but we
   267  	// can skip those when parsing.
   268  
   269  	l := newLineReader(fd, scratch, read)
   270  
   271  	for {
   272  		err := l.next()
   273  		if err == errIncompleteLine {
   274  			// An incomplete line is fine as long as it doesn't
   275  			// impede parsing the fields we need. It shouldn't be
   276  			// possible for any mount to use more than 3*PATH_MAX
   277  			// before (9) because there are two paths and all other
   278  			// earlier fields have bounded options. Only (11) has
   279  			// unbounded options.
   280  		} else if err == errEOF {
   281  			break
   282  		} else if err != nil {
   283  			return 0, err
   284  		}
   285  
   286  		line := l.line()
   287  
   288  		// Skip first three fields.
   289  		for range 3 {
   290  			i := bytealg.IndexByte(line, ' ')
   291  			if i < 0 {
   292  				return 0, errMalformedFile
   293  			}
   294  			line = line[i+1:]
   295  		}
   296  
   297  		// (4) root:  root of the mount within the filesystem
   298  		i := bytealg.IndexByte(line, ' ')
   299  		if i < 0 {
   300  			return 0, errMalformedFile
   301  		}
   302  		root := line[:i]
   303  		if len(root) == 0 || root[0] != '/' {
   304  			// We rely on this in hasPathPrefix.
   305  			return 0, errMalformedFile
   306  		}
   307  		line = line[i+1:]
   308  
   309  		// (5) mount point:  mount point relative to the process's root
   310  		i = bytealg.IndexByte(line, ' ')
   311  		if i < 0 {
   312  			return 0, errMalformedFile
   313  		}
   314  		mnt := line[:i]
   315  		line = line[i+1:]
   316  
   317  		// Skip ahead past optional fields, delimited by " - ".
   318  		for {
   319  			i = bytealg.IndexByte(line, ' ')
   320  			if i < 0 {
   321  				return 0, errMalformedFile
   322  			}
   323  			if i+3 >= len(line) {
   324  				return 0, errMalformedFile
   325  			}
   326  			delim := line[i : i+3]
   327  			if string(delim) == " - " {
   328  				line = line[i+3:]
   329  				break
   330  			}
   331  			line = line[i+1:]
   332  		}
   333  
   334  		// (9) filesystem type:  name of filesystem of the form "type[.subtype]"
   335  		i = bytealg.IndexByte(line, ' ')
   336  		if i < 0 {
   337  			return 0, errMalformedFile
   338  		}
   339  		ftype := line[:i]
   340  		line = line[i+1:]
   341  
   342  		switch version {
   343  		case V1:
   344  			if string(ftype) != "cgroup" {
   345  				continue
   346  			}
   347  			// (10) mount source:  filesystem specific information or "none"
   348  			i = bytealg.IndexByte(line, ' ')
   349  			if i < 0 {
   350  				return 0, errMalformedFile
   351  			}
   352  			// Don't care about mount source.
   353  			line = line[i+1:]
   354  
   355  			// (11) super options:  per super block options
   356  			if !containsCPU(line) {
   357  				continue
   358  			}
   359  		case V2:
   360  			if string(ftype) != "cgroup2" {
   361  				continue
   362  			}
   363  		default:
   364  			throw("impossible cgroup version")
   365  			panic("unreachable")
   366  		}
   367  
   368  		// Check cgroup is in the root.
   369  		// If the cgroup is /sandbox/container, the matching mount point root could be
   370  		// /sandbox/container, /sandbox, or /
   371  		rootLen, err := unescapePath(root, root)
   372  		if err != nil {
   373  			return 0, err
   374  		}
   375  		root = root[:rootLen]
   376  		if !hasPathPrefix(cgroup, root) {
   377  			continue // not matched, this is not the mount point we're looking for
   378  		}
   379  
   380  		// Cutoff the root from cgroup, ensure rel starts with '/' or is empty.
   381  		rel := cgroup[rootLen:]
   382  		if rootLen == 1 && len(cgroup) > 1 {
   383  			// root is "/", but cgroup is not. Keep full cgroup path.
   384  			rel = cgroup
   385  		}
   386  		if hasPathPrefix(rel, []byte("/..")) {
   387  			// the cgroup is out of current cgroup namespace, and this mount point
   388  			// cannot reach that cgroup.
   389  			//
   390  			// e.g. If the process is in cgroup /init, but in a cgroup namespace
   391  			// rooted at /sandbox/container, /proc/self/cgroup will show /../../init.
   392  			// we can reach it if the mount point root is
   393  			// /../.. or /../../init, but not if it is /.. or /
   394  			// While mount point with root /../../.. should able to reach the cgroup,
   395  			// we don't know the path to the cgroup within that mount point.
   396  			continue
   397  		}
   398  
   399  		// All conditions met, compose the full path.
   400  		// Copy rel to the correct place first, it may overlap with out.
   401  		n := unescapedLen(mnt)
   402  		if n+len(rel) > len(out) {
   403  			return 0, errPathTooLong
   404  		}
   405  		copy(out[n:], rel)
   406  		n2, err := unescapePath(out[:n], mnt)
   407  		if err != nil {
   408  			return 0, err
   409  		}
   410  		if n2 != n {
   411  			throw("wrong unescaped len")
   412  		}
   413  		return n + len(rel), nil
   414  	}
   415  
   416  	// Found nothing.
   417  	return 0, ErrNoCgroup
   418  }
   419  
   420  func hasPathPrefix(p, prefix []byte) bool {
   421  	i := len(prefix)
   422  	if i == 1 {
   423  		return true // root contains everything
   424  	}
   425  	if len(p) < i || !bytealg.Equal(prefix, p[:i]) {
   426  		return false
   427  	}
   428  	return len(p) == i || p[i] == '/' // must match at path boundary
   429  }
   430  
var (
	// errInvalidEscape is returned by unescapePath when the input contains
	// a backslash that does not begin a valid three-digit octal escape
	// sequence, or when the output buffer fills up before the input is
	// consumed.
	errInvalidEscape error = stringError("invalid path escape sequence")
	// errPathTooLong is returned by parseCPUCgroup and parseCPUMount when
	// a path does not fit in the caller's output buffer.
	errPathTooLong error = stringError("path too long")
)
   435  
   436  func unescapedLen(in []byte) int {
   437  	return len(in) - bytealg.Count(in, byte('\\'))*3
   438  }
   439  
   440  // unescapePath copies in to out, unescaping escape sequences generated by
   441  // Linux's show_path.
   442  //
   443  // That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences,
   444  // like '\040' for space.
   445  //
   446  // Caller must ensure that out at least has unescapedLen(in) bytes.
   447  // in and out may alias; in-place unescaping is supported.
   448  //
   449  // Returns the number of bytes written to out.
   450  //
   451  // Also see escapePath in cgroup_linux_test.go.
   452  func unescapePath(out []byte, in []byte) (int, error) {
   453  	var outi, ini int
   454  	for ini < len(in) {
   455  		if outi >= len(out) {
   456  			// given that caller already ensured out is long enough, this
   457  			// is only possible if there are malformed escape sequences
   458  			// we have not parsed yet.
   459  			return outi, errInvalidEscape
   460  		}
   461  		c := in[ini]
   462  		if c != '\\' {
   463  			out[outi] = c
   464  			outi++
   465  			ini++
   466  			continue
   467  		}
   468  
   469  		// Start of escape sequence.
   470  
   471  		// Escape sequence is always 4 characters: one slash and three
   472  		// digits.
   473  		if ini+3 >= len(in) {
   474  			return outi, errInvalidEscape
   475  		}
   476  
   477  		var outc int
   478  		for i := range 3 {
   479  			c := in[ini+1+i]
   480  			if c < '0' || c > '7' {
   481  				return outi, errInvalidEscape
   482  			}
   483  
   484  			outc *= 8
   485  			outc += int(c - '0')
   486  		}
   487  
   488  		if outc > 0xFF {
   489  			return outi, errInvalidEscape
   490  		}
   491  		out[outi] = byte(outc)
   492  		outi++
   493  
   494  		ini += 4
   495  	}
   496  
   497  	return outi, nil
   498  }
   499  

View as plain text