// Source file src/internal/runtime/cgroup/cgroup.go
1 // Copyright 2025 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package cgroup 6 7 import ( 8 "internal/bytealg" 9 "internal/strconv" 10 ) 11 12 var ( 13 ErrNoCgroup error = stringError("not in a cgroup") 14 15 errMalformedFile error = stringError("malformed file") 16 ) 17 18 const _PATH_MAX = 4096 19 20 const ( 21 // Required amount of scratch space for CPULimit. 22 // 23 // TODO(prattmic): This is shockingly large (~70KiB) due to the (very 24 // unlikely) combination of extremely long paths consisting mostly 25 // escaped characters. The scratch buffer ends up in .bss in package 26 // runtime, so it doesn't contribute to binary size and generally won't 27 // be faulted in, but it would still be nice to shrink this. A more 28 // complex parser that did not need to keep entire lines in memory 29 // could get away with much less. Alternatively, we could do a one-off 30 // mmap allocation for this buffer, which is only mapped larger if we 31 // actually need the extra space. 32 ScratchSize = PathSize + ParseSize 33 34 // Required space to store a path of the cgroup in the filesystem. 35 PathSize = _PATH_MAX 36 37 // /proc/self/mountinfo path escape sequences are 4 characters long, so 38 // a path consisting entirely of escaped characters could be 4 times 39 // larger. 40 escapedPathMax = 4 * _PATH_MAX 41 42 // Required space to parse /proc/self/mountinfo and /proc/self/cgroup. 43 // See findCPUMount and findCPURelativePath. 44 ParseSize = 4 * escapedPathMax 45 ) 46 47 // Version indicates the cgroup version. 48 type Version int 49 50 const ( 51 VersionUnknown Version = iota 52 V1 53 V2 54 ) 55 56 func parseV1Number(buf []byte) (int64, error) { 57 // Ignore trailing newline. 
58 i := bytealg.IndexByte(buf, '\n') 59 if i < 0 { 60 return 0, errMalformedFile 61 } 62 buf = buf[:i] 63 64 val, err := strconv.ParseInt(string(buf), 10, 64) 65 if err != nil { 66 return 0, errMalformedFile 67 } 68 69 return val, nil 70 } 71 72 func parseV2Limit(buf []byte) (float64, bool, error) { 73 i := bytealg.IndexByte(buf, ' ') 74 if i < 0 { 75 return 0, false, errMalformedFile 76 } 77 78 quotaStr := buf[:i] 79 if bytealg.Compare(quotaStr, []byte("max")) == 0 { 80 // No limit. 81 return 0, false, nil 82 } 83 84 periodStr := buf[i+1:] 85 // Ignore trailing newline, if any. 86 i = bytealg.IndexByte(periodStr, '\n') 87 if i < 0 { 88 return 0, false, errMalformedFile 89 } 90 periodStr = periodStr[:i] 91 92 quota, err := strconv.ParseInt(string(quotaStr), 10, 64) 93 if err != nil { 94 return 0, false, errMalformedFile 95 } 96 97 period, err := strconv.ParseInt(string(periodStr), 10, 64) 98 if err != nil { 99 return 0, false, errMalformedFile 100 } 101 102 return float64(quota) / float64(period), true, nil 103 } 104 105 // Finds the path of the current process's CPU cgroup and writes it to out. 106 // 107 // fd is a file descriptor for /proc/self/cgroup. 108 // Returns the number of bytes written and the cgroup version (1 or 2). 109 func parseCPUCgroup(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, Version, error) { 110 // The format of each line is 111 // 112 // hierarchy-ID:controller-list:cgroup-path 113 // 114 // controller-list is comma-separated. 115 // 116 // cgroup v2 has hierarchy-ID 0. If a v1 hierarchy contains "cpu", that 117 // is the CPU controller. Otherwise the v2 hierarchy (if any) is the 118 // CPU controller. It is not possible to mount the same controller 119 // simultaneously under both the v1 and the v2 hierarchies. 120 // 121 // See man 7 cgroups for more details. 
122 // 123 // hierarchy-ID and controller-list have relatively small maximum 124 // sizes, and the path can be up to _PATH_MAX, so we need a bit more 125 // than 1 _PATH_MAX of scratch space. 126 127 l := newLineReader(fd, scratch, read) 128 129 // Bytes written to out. 130 n := 0 131 132 for { 133 err := l.next() 134 if err == errIncompleteLine { 135 // Don't allow incomplete lines. While in theory the 136 // incomplete line may be for a controller we don't 137 // care about, in practice all lines should be of 138 // similar length, so we should just have a buffer big 139 // enough for any. 140 return 0, 0, err 141 } else if err == errEOF { 142 break 143 } else if err != nil { 144 return 0, 0, err 145 } 146 147 line := l.line() 148 149 // The format of each line is 150 // 151 // hierarchy-ID:controller-list:cgroup-path 152 // 153 // controller-list is comma-separated. 154 // See man 7 cgroups for more details. 155 i := bytealg.IndexByte(line, ':') 156 if i < 0 { 157 return 0, 0, errMalformedFile 158 } 159 160 hierarchy := line[:i] 161 line = line[i+1:] 162 163 i = bytealg.IndexByte(line, ':') 164 if i < 0 { 165 return 0, 0, errMalformedFile 166 } 167 168 controllers := line[:i] 169 line = line[i+1:] 170 171 path := line 172 if len(path) == 0 || path[0] != '/' { 173 // We rely on this when composing the full path. 174 return 0, 0, errMalformedFile 175 } 176 if len(path) > len(out) { 177 // Should not be possible. If we really get a very long cgroup path, 178 // read /proc/self/cgroup will fail with ENAMETOOLONG. 179 return 0, 0, errPathTooLong 180 } 181 182 if string(hierarchy) == "0" { 183 // v2 hierarchy. 184 n = copy(out, path) 185 // Keep searching, we might find a v1 hierarchy with a 186 // CPU controller, which takes precedence. 187 } else { 188 // v1 hierarchy 189 if containsCPU(controllers) { 190 // Found a v1 CPU controller. This must be the 191 // only one, so we're done. 
192 return copy(out, path), V1, nil 193 } 194 } 195 } 196 197 if n == 0 { 198 // Found nothing. 199 return 0, 0, ErrNoCgroup 200 } 201 202 // Must be v2, v1 returns above. 203 return n, V2, nil 204 } 205 206 // Returns true if comma-separated list b contains "cpu". 207 func containsCPU(b []byte) bool { 208 for len(b) > 0 { 209 i := bytealg.IndexByte(b, ',') 210 if i < 0 { 211 // Neither cmd/compile nor gccgo allocates for these string conversions. 212 return string(b) == "cpu" 213 } 214 215 curr := b[:i] 216 rest := b[i+1:] 217 218 if string(curr) == "cpu" { 219 return true 220 } 221 222 b = rest 223 } 224 225 return false 226 } 227 228 // Returns the path to the specified cgroup and version with cpu controller 229 // 230 // fd is a file descriptor for /proc/self/mountinfo. 231 // Returns the number of bytes written. 232 func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out, cgroup []byte, version Version, scratch []byte) (int, error) { 233 // The format of each line is: 234 // 235 // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue 236 // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) 237 // 238 // (1) mount ID: unique identifier of the mount (may be reused after umount) 239 // (2) parent ID: ID of parent (or of self for the top of the mount tree) 240 // (3) major:minor: value of st_dev for files on filesystem 241 // (4) root: root of the mount within the filesystem 242 // (5) mount point: mount point relative to the process's root 243 // (6) mount options: per mount options 244 // (7) optional fields: zero or more fields of the form "tag[:value]" 245 // (8) separator: marks the end of the optional fields 246 // (9) filesystem type: name of filesystem of the form "type[.subtype]" 247 // (10) mount source: filesystem specific information or "none" 248 // (11) super options: per super block options 249 // 250 // See man 5 proc_pid_mountinfo for more details. 
251 // 252 // Note that emitted paths will not contain space, tab, newline, or 253 // carriage return. Those are escaped. See Linux show_mountinfo -> 254 // show_path. We must unescape before returning. 255 // 256 // A mount point matches if the filesystem type (9) is cgroup2, 257 // or cgroup with "cpu" in the super options (11), 258 // and the cgroup is in the root (4). If there are multiple matches, 259 // the first one is selected. 260 // 261 // We return full cgroup path, which is the mount point (5) + 262 // cgroup parameter without the root (4) prefix. 263 // 264 // (4), (5), and (10) are up to _PATH_MAX. The remaining fields have a 265 // small fixed maximum size, so 4*_PATH_MAX is plenty of scratch space. 266 // Note that non-cgroup mounts may have arbitrarily long (11), but we 267 // can skip those when parsing. 268 269 l := newLineReader(fd, scratch, read) 270 271 for { 272 err := l.next() 273 if err == errIncompleteLine { 274 // An incomplete line is fine as long as it doesn't 275 // impede parsing the fields we need. It shouldn't be 276 // possible for any mount to use more than 3*PATH_MAX 277 // before (9) because there are two paths and all other 278 // earlier fields have bounded options. Only (11) has 279 // unbounded options. 280 } else if err == errEOF { 281 break 282 } else if err != nil { 283 return 0, err 284 } 285 286 line := l.line() 287 288 // Skip first three fields. 289 for range 3 { 290 i := bytealg.IndexByte(line, ' ') 291 if i < 0 { 292 return 0, errMalformedFile 293 } 294 line = line[i+1:] 295 } 296 297 // (4) root: root of the mount within the filesystem 298 i := bytealg.IndexByte(line, ' ') 299 if i < 0 { 300 return 0, errMalformedFile 301 } 302 root := line[:i] 303 if len(root) == 0 || root[0] != '/' { 304 // We rely on this in hasPathPrefix. 
305 return 0, errMalformedFile 306 } 307 line = line[i+1:] 308 309 // (5) mount point: mount point relative to the process's root 310 i = bytealg.IndexByte(line, ' ') 311 if i < 0 { 312 return 0, errMalformedFile 313 } 314 mnt := line[:i] 315 line = line[i+1:] 316 317 // Skip ahead past optional fields, delimited by " - ". 318 for { 319 i = bytealg.IndexByte(line, ' ') 320 if i < 0 { 321 return 0, errMalformedFile 322 } 323 if i+3 >= len(line) { 324 return 0, errMalformedFile 325 } 326 delim := line[i : i+3] 327 if string(delim) == " - " { 328 line = line[i+3:] 329 break 330 } 331 line = line[i+1:] 332 } 333 334 // (9) filesystem type: name of filesystem of the form "type[.subtype]" 335 i = bytealg.IndexByte(line, ' ') 336 if i < 0 { 337 return 0, errMalformedFile 338 } 339 ftype := line[:i] 340 line = line[i+1:] 341 342 switch version { 343 case V1: 344 if string(ftype) != "cgroup" { 345 continue 346 } 347 // (10) mount source: filesystem specific information or "none" 348 i = bytealg.IndexByte(line, ' ') 349 if i < 0 { 350 return 0, errMalformedFile 351 } 352 // Don't care about mount source. 353 line = line[i+1:] 354 355 // (11) super options: per super block options 356 if !containsCPU(line) { 357 continue 358 } 359 case V2: 360 if string(ftype) != "cgroup2" { 361 continue 362 } 363 default: 364 throw("impossible cgroup version") 365 panic("unreachable") 366 } 367 368 // Check cgroup is in the root. 369 // If the cgroup is /sandbox/container, the matching mount point root could be 370 // /sandbox/container, /sandbox, or / 371 rootLen, err := unescapePath(root, root) 372 if err != nil { 373 return 0, err 374 } 375 root = root[:rootLen] 376 if !hasPathPrefix(cgroup, root) { 377 continue // not matched, this is not the mount point we're looking for 378 } 379 380 // Cutoff the root from cgroup, ensure rel starts with '/' or is empty. 381 rel := cgroup[rootLen:] 382 if rootLen == 1 && len(cgroup) > 1 { 383 // root is "/", but cgroup is not. Keep full cgroup path. 
384 rel = cgroup 385 } 386 if hasPathPrefix(rel, []byte("/..")) { 387 // the cgroup is out of current cgroup namespace, and this mount point 388 // cannot reach that cgroup. 389 // 390 // e.g. If the process is in cgroup /init, but in a cgroup namespace 391 // rooted at /sandbox/container, /proc/self/cgroup will show /../../init. 392 // we can reach it if the mount point root is 393 // /../.. or /../../init, but not if it is /.. or / 394 // While mount point with root /../../.. should able to reach the cgroup, 395 // we don't know the path to the cgroup within that mount point. 396 continue 397 } 398 399 // All conditions met, compose the full path. 400 // Copy rel to the correct place first, it may overlap with out. 401 n := unescapedLen(mnt) 402 if n+len(rel) > len(out) { 403 return 0, errPathTooLong 404 } 405 copy(out[n:], rel) 406 n2, err := unescapePath(out[:n], mnt) 407 if err != nil { 408 return 0, err 409 } 410 if n2 != n { 411 throw("wrong unescaped len") 412 } 413 return n + len(rel), nil 414 } 415 416 // Found nothing. 417 return 0, ErrNoCgroup 418 } 419 420 func hasPathPrefix(p, prefix []byte) bool { 421 i := len(prefix) 422 if i == 1 { 423 return true // root contains everything 424 } 425 if len(p) < i || !bytealg.Equal(prefix, p[:i]) { 426 return false 427 } 428 return len(p) == i || p[i] == '/' // must match at path boundary 429 } 430 431 var ( 432 errInvalidEscape error = stringError("invalid path escape sequence") 433 errPathTooLong error = stringError("path too long") 434 ) 435 436 func unescapedLen(in []byte) int { 437 return len(in) - bytealg.Count(in, byte('\\'))*3 438 } 439 440 // unescapePath copies in to out, unescaping escape sequences generated by 441 // Linux's show_path. 442 // 443 // That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences, 444 // like '\040' for space. 445 // 446 // Caller must ensure that out at least has unescapedLen(in) bytes. 447 // in and out may alias; in-place unescaping is supported. 
448 // 449 // Returns the number of bytes written to out. 450 // 451 // Also see escapePath in cgroup_linux_test.go. 452 func unescapePath(out []byte, in []byte) (int, error) { 453 var outi, ini int 454 for ini < len(in) { 455 if outi >= len(out) { 456 // given that caller already ensured out is long enough, this 457 // is only possible if there are malformed escape sequences 458 // we have not parsed yet. 459 return outi, errInvalidEscape 460 } 461 c := in[ini] 462 if c != '\\' { 463 out[outi] = c 464 outi++ 465 ini++ 466 continue 467 } 468 469 // Start of escape sequence. 470 471 // Escape sequence is always 4 characters: one slash and three 472 // digits. 473 if ini+3 >= len(in) { 474 return outi, errInvalidEscape 475 } 476 477 var outc int 478 for i := range 3 { 479 c := in[ini+1+i] 480 if c < '0' || c > '7' { 481 return outi, errInvalidEscape 482 } 483 484 outc *= 8 485 outc += int(c - '0') 486 } 487 488 if outc > 0xFF { 489 return outi, errInvalidEscape 490 } 491 out[outi] = byte(outc) 492 outi++ 493 494 ini += 4 495 } 496 497 return outi, nil 498 } 499