diff --git a/CHANGELOG.md b/CHANGELOG.md index e4ce0178aca..10c6f31cf4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased 1.2.z] +### Fixed + * Container processes will no longer inherit the CPU affinity of runc by + default. Instead, the default CPU affinity of container processes will be + the largest set of CPUs permitted by the container's cpuset cgroup and any + other system restrictions (such as isolated CPUs). (#4041, #4815, #4858) + ## [1.2.6] - 2025-03-17 > Hasta la victoria, siempre. diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 18a5a2bfbea..8dab3caefa1 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -122,6 +122,46 @@ func (p *setnsProcess) signal(sig os.Signal) error { return unix.Kill(p.pid(), s) } +// tryResetCPUAffinity tries to reset the CPU affinity of the process +// identified by pid to include all possible CPUs (subject to cgroup +// cpuset restrictions and isolated CPUs). +func tryResetCPUAffinity(pid int) { + // When resetting the CPU affinity, we want to match the configured cgroup + // cpuset (or the default set of all CPUs, if no cpuset is configured) + // rather than some more restrictive affinity we were spawned in (such as + // one that may have been inherited from systemd). The cpuset cgroup used + // to reconfigure the cpumask automatically for joining processes, but + // kcommit da019032819a ("sched: Enforce user requested affinity") changed + // this behaviour in Linux 6.2. + // + // Parsing cpuset.cpus.effective is quite inefficient (and looking at + // things like /proc/stat would be wrong for most nested containers), but + // luckily sched_setaffinity(2) will implicitly: + // + // * Clamp the cpumask so that it matches the current number of CPUs on + // the system. 
+ // * Mask out any CPUs that are not a member of the target task's + // configured cgroup cpuset. + // + // So we can just pass a very large array of set cpumask bits and the + // kernel will silently convert that to the correct value very cheaply. + + // Ideally, we would just set the array to 0xFF...FF. Unfortunately, the + // size depends on the architecture. It is also a private newtype, so we + // can't use (^0) or generics since those require us to be able to name the + // type. However, we can just underflow the zero value instead. + // TODO: Once the upstream change that provides a helper for this is merged, switch to that (the reference to that change is missing here; presumably it is in golang.org/x/sys/unix -- confirm). + cpuset := unix.CPUSet{} + for i := range cpuset { + cpuset[i]-- // underflow to 0xFF..FF + } + if err := unix.SchedSetaffinity(pid, &cpuset); err != nil { + logrus.WithError( + os.NewSyscallError("sched_setaffinity", err), + ).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid) + } +} + func (p *setnsProcess) start() (retErr error) { defer p.comm.closeParent() @@ -184,6 +224,9 @@ func (p *setnsProcess) start() (retErr error) { } } } + // Reset the CPU affinity after cgroups are configured to make sure it + // matches any configured cpuset. + tryResetCPUAffinity(p.pid()) if p.intelRdtPath != "" { // if Intel RDT "resource control" filesystem path exists _, err := os.Stat(p.intelRdtPath) @@ -578,6 +621,9 @@ func (p *initProcess) start() (retErr error) { return fmt.Errorf("unable to apply cgroup configuration: %w", err) } } + // Reset the CPU affinity after cgroups are configured to make sure it + // matches any configured cpuset. 
+ tryResetCPUAffinity(p.pid()) if p.intelRdtManager != nil { if err := p.intelRdtManager.Apply(p.pid()); err != nil { return fmt.Errorf("unable to apply Intel RDT configuration: %w", err) diff --git a/tests/integration/cpu_affinity.bats b/tests/integration/cpu_affinity.bats new file mode 100644 index 00000000000..5df65374352 --- /dev/null +++ b/tests/integration/cpu_affinity.bats @@ -0,0 +1,127 @@ +#!/usr/bin/env bats +# Exec CPU affinity tests. For more details, see: +# - https://github.com/opencontainers/runtime-spec/pull/1253 + +load helpers + +INITIAL_CPU_MASK="$(grep -F Cpus_allowed_list: /proc/self/status | awk '{ print $2 }')" + +function setup() { + requires smp cgroups_cpuset + setup_busybox + + echo "Initial CPU mask: $INITIAL_CPU_MASK" >&2 + echo "---" >&2 +} + +function teardown() { + teardown_bundle +} + +function first_cpu() { + sed 's/[-,].*//g' &2 + echo "$cmdname $* (status=$status)" >&2 # shellcheck disable=SC2154 echo "$output" >&2 } +# Wrapper for runc. +function runc() { + CMDNAME="$(basename "$RUNC")" sane_run __runc "$@" +} + +function setup_runc_cmdline() { + RUNC_CMDLINE=("$RUNC") + [[ -v RUNC_USE_SYSTEMD ]] && RUNC_CMDLINE+=("--systemd-cgroup") + [[ -n "${ROOT:-}" ]] && RUNC_CMDLINE+=("--root" "$ROOT/state") + export RUNC_CMDLINE +} + # Raw wrapper for runc. function __runc() { - "$RUNC" ${RUNC_USE_SYSTEMD+--systemd-cgroup} \ - ${ROOT:+--root "$ROOT/state"} "$@" + setup_runc_cmdline + "${RUNC_CMDLINE[@]}" "$@" } # Wrapper for runc spec.