From 53786fd941b572f1d16cead42d165850a16a58a6 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 12 Feb 2026 09:12:03 +1100 Subject: [PATCH 1/5] procfs: move CACHED_PROCFS_HANDLE from global scope This is purely an implementation detail of ProcfsHandleBuilder::build and doesn't need to live in the global namespace. Signed-off-by: Aleksa Sarai --- src/procfs.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/procfs.rs b/src/procfs.rs index b646cd33..bec81eb6 100644 --- a/src/procfs.rs +++ b/src/procfs.rs @@ -232,9 +232,6 @@ pub struct ProcfsHandleBuilder { subset_pid: bool, } -// MSRV(1.70): Use std::sync::OnceLock. -static CACHED_PROCFS_HANDLE: OnceLock = OnceLock::new(); - impl Default for ProcfsHandleBuilder { fn default() -> Self { Self::new() @@ -341,6 +338,9 @@ impl ProcfsHandleBuilder { /// panic as this is not a state that should be possible to reach in regular /// program execution. pub fn build(self) -> Result { + // MSRV(1.70): Use std::sync::OnceLock. + static CACHED_PROCFS_HANDLE: OnceLock = OnceLock::new(); + // MSRV(1.85): Use let chain here (Rust 2024). if self.is_cache_friendly() { // If there is already a cached filesystem available, use that. From 30b964882a5ce8adf52440bfeb629ce093986717 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 12 Feb 2026 16:15:31 +1100 Subject: [PATCH 2/5] utils: fd: allow using fd 0 with FdExt::get_fdinfo_field There was an off-by-one when writing the check for valid file descriptor values, resulting in fd 0 not being usable with FdExt::get_fdinfo_field. Signed-off-by: Aleksa Sarai --- src/utils/fd.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/fd.rs b/src/utils/fd.rs index 68d8dc41..8498310d 100644 --- a/src/utils/fd.rs +++ b/src/utils/fd.rs @@ -317,9 +317,9 @@ impl FdExt for Fd { { let fd = self.as_fd(); let fdinfo_path = match fd.as_raw_fd() { - // MSRV(1.66): Use ..=0 (half_open_range_patterns).
+ // MSRV(1.66): Use ..=-1 (half_open_range_patterns). // MSRV(1.80): Use ..0 (exclusive_range_pattern). - fd @ libc::AT_FDCWD | fd @ RawFd::MIN..=0 => Err(ErrorImpl::OsError { + fd @ libc::AT_FDCWD | fd @ RawFd::MIN..=-1 => Err(ErrorImpl::OsError { operation: format!("get relative procfs fdinfo path for fd {fd}").into(), source: IOError::from_raw_os_error(libc::EBADF), })?, From d91462f2763ecfea6dcb3f8e002152646a368558 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 12 Feb 2026 00:43:47 +1100 Subject: [PATCH 3/5] utils: add kernel version comparison code While we do need to have fallbacks if a feature is missing, and we need to handle the case where a feature (like openat2) is disabled via seccomp, it is handy to be able to have logic based on kernel version in some other cases. Signed-off-by: Aleksa Sarai --- Cargo.lock | 7 + Cargo.toml | 3 +- src/syscalls.rs | 30 +++ src/utils.rs | 2 + src/utils/kernel_version.rs | 411 ++++++++++++++++++++++++++++++++++++ 5 files changed, 452 insertions(+), 1 deletion(-) create mode 100644 src/utils/kernel_version.rs diff --git a/Cargo.lock b/Cargo.lock index a00181b6..7812ef55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -382,6 +382,7 @@ dependencies = [ "rand", "rustix", "rustversion", + "scopeguard", "static_assertions", "tempfile", "thiserror 2.0.17", @@ -490,6 +491,12 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "serde" version = "1.0.228" diff --git a/Cargo.toml b/Cargo.toml index 7bc8dd18..7621dd91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,7 +75,7 @@ once_cell = "1" # MSRV(1.65): Update to >=0.4.1 which uses let_else. 0.4.0 was broken. 
open-enum = { version = "0.3", optional = true } rand = { version = "0.9", optional = true } -rustix = { version = "1.1", features = ["fs", "process", "thread", "mount"] } +rustix = { version = "1.1", features = ["fs", "process", "system", "thread", "mount"] } rustversion = "1" thiserror = "2" static_assertions = "1.1" @@ -89,6 +89,7 @@ tempfile = "3" paste = "1" path-clean = "1" pretty_assertions = { version = "1.4.1", features = ["unstable"] } +scopeguard = "1" [build-dependencies] tempfile = "3" diff --git a/src/syscalls.rs b/src/syscalls.rs index 7ce7d3b6..ee357ef2 100644 --- a/src/syscalls.rs +++ b/src/syscalls.rs @@ -874,6 +874,36 @@ pub(crate) mod openat2 { pub(crate) use openat2::{openat2, openat2_follow, OpenHow, ResolveFlags}; +#[cfg(test)] +mod personality { + // musl doesn't expose UNAME26. + #[cfg(not(target_env = "musl"))] + pub(crate) const PER_UNAME26: u32 = libc::UNAME26 as _; + #[cfg(target_env = "musl")] + pub(crate) const PER_UNAME26: u32 = 0x0020000; /* */ + + pub(crate) fn personality(persona: Option) -> u32 { + unsafe { libc::personality(persona.unwrap_or(0xFFFF_FFFF) as _) as _ } + } + + /// Temporarily change the personality of the running thread. + /// + /// The personality is reset to the original persona value (i.e., when + /// [`scoped_personality`] was first called) once the returned `impl Drop` + /// value is dropped. Note that any threads or subprocesses spawned with + /// the `scoped_personality` guard held will permanently inherit the + /// specified persona. 
+ #[must_use] + pub(crate) fn scoped_personality(persona: u32) -> impl Drop { + scopeguard::guard(personality(Some(persona)), |old_persona| { + personality(Some(old_persona)); + }) + } +} + +#[cfg(test)] +pub(crate) use personality::*; + #[cfg(test)] pub(crate) fn getpid() -> rustix_process::RawPid { rustix_process::Pid::as_raw(Some(rustix_process::getpid())) diff --git a/src/utils.rs b/src/utils.rs index a41361a9..fe4227ad 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -52,3 +52,5 @@ pub(crate) use maybe_owned::*; mod raw_procfs; pub(crate) use raw_procfs::*; + +pub(crate) mod kernel_version; diff --git a/src/utils/kernel_version.rs b/src/utils/kernel_version.rs new file mode 100644 index 00000000..1d4c4178 --- /dev/null +++ b/src/utils/kernel_version.rs @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: MPL-2.0 OR LGPL-3.0-or-later +/* + * libpathrs: safe path resolution on Linux + * Copyright (C) 2026 Aleksa Sarai + * + * == MPL-2.0 == + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + * + * Alternatively, this Source Code Form may also (at your option) be used + * under the terms of the GNU Lesser General Public License Version 3, as + * described below: + * + * == LGPL-3.0-or-later == + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General + * Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. 
If not, see <https://www.gnu.org/licenses/>. + */ + +use std::{ + cmp::{self, Ordering}, + fmt, +}; + +use once_cell::sync::Lazy; +use rustix::system as rustix_system; + +/// A representation of a Linux kernel version that can be mutually compared +/// against, usually to check if a kernel ought to have a particular feature. +/// +/// Use [`parse_kernel_version`] to convert a kernel version string (such as +/// returned by `uname -r`) to a [`KernelVersion`]. +/// +/// # Comparisons # +/// +/// Note that the system for comparing kernel versions is not very akin to +/// SemVer because Linux kernel versions can (in principle) have arbitrarily +/// many dot components. If both kernel versions have the same number of +/// components, then the comparison is done left-to-right per-component in a +/// manner identical to the way [`slice`]s are compared. If the kernel versions +/// have different numbers of components then the comparison is done as though +/// the shorter kernel version was right-padded with additional `0` components. +/// +/// Thus, `3[.0.0] < 3.1[.0] < 3.1.18 < 4[.0.0]`. +#[derive(Clone, Debug)] +pub(crate) struct KernelVersion(pub(crate) Vec); + +impl fmt::Display for KernelVersion { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "linux-{}", + self.0 + .iter() + .map(ToString::to_string) + .collect::>() + .join(".") + ) + } +} + +impl Ord for KernelVersion { + fn cmp(&self, other: &Self) -> Ordering { + // In contrast to regular slices, when comparing kernel versions we + // effectively zero-extend the trailing part of the shorter kernel + // version specification (so 3.1.2 > 3.1[.0] > 3[.0.0]). + let common_len = cmp::min(self.0.len(), other.0.len()); + match self.0[..common_len].cmp(&other.0[..common_len]) { + // We only need to deal with the annoying trailing end case if the + // common part is equal.
+ Ordering::Equal => match ( + self.0[common_len..].iter().any(|&n| n > 0), + other.0[common_len..].iter().any(|&n| n > 0), + ) { + (false, false) => Ordering::Equal, // all zeros or equal lengths + (true, false) => Ordering::Greater, // self tail > 0 + (false, true) => Ordering::Less, // other tail > 0 + (true, true) => unreachable!("both KernelVersion slices cannot have non-zero tails because one must be empty"), + }, + cmp => cmp, + } + } +} + +impl PartialOrd for KernelVersion { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for KernelVersion { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl Eq for KernelVersion {} + +/// Parse a kernel version string like `"4.10.23-1-default"` into a +/// [`KernelVersion`] that can be used for comparisons. +fn parse_kernel_version(kver_str: &str) -> Option { + // Strip off any of the non-version suffixes from the kver string. + let kver_str = &kver_str[..kver_str + .find(|ch: char| !ch.is_ascii_digit() && ch != '.') + .unwrap_or(kver_str.len())]; + + let kver = kver_str + .split('.') + .map(|num| match num { + "" => None, // version components must be non-empty + _ => num.parse().ok(), // version components must be valid numbers + }) + .collect::>>() + .map(KernelVersion); + + match kver { + // Versions must have >= 2 components (actually, in practice it's >= 3). + Some(KernelVersion(ref v)) if v.len() >= 2 => kver, + _ => None, + } +} + +// MSRV(1.80): Use LazyLock. +pub(crate) static HOST_KERNEL_VERSION: Lazy = Lazy::new(host_kernel_version); + +pub(crate) fn host_kernel_version() -> KernelVersion { + parse_kernel_version(&rustix_system::uname().release().to_string_lossy()) + .expect("uname kernel release must be a valid KernelVersion string") +} + +/// Returns whether the kernel version of the running system is at least as new +/// as the kernel version specified. 
See the documentation of [`KernelVersion`] +/// for more information on how kernel versions are compared. +macro_rules! is_gte { + ($($part:literal),+) => { + { + $( + const _: u64 = $part; + )+ + + // Some of our tests rely on using personality(2) to fake older + // (2.6-era) kernel versions, so we cannot use the cached kernel + // version when in test builds. However, to make sure that we still + // test that HOST_KERNEL_VERSION doesn't do anything silly (like + // crash) we still compute it in our tests. + let cached_host_kver = &*$crate::utils::kernel_version::HOST_KERNEL_VERSION; + #[cfg(test)] + let host_kver = &$crate::utils::kernel_version::host_kernel_version(); + #[cfg(not(test))] + let host_kver = cached_host_kver; + + #[cfg(test)] + eprintln!("cached host kernel is {cached_host_kver} but using {host_kver} for test"); + + let cmp_kver = &$crate::utils::kernel_version::KernelVersion(vec![$($part),+]); + + host_kver >= cmp_kver + } + }; +} +pub(crate) use is_gte; + +#[cfg(test)] +mod tests { + use super::*; + use crate::syscalls; + + use pretty_assertions::assert_eq; + + macro_rules! 
kver { + ($($part:literal),+) => { + KernelVersion(vec![$($part),+]) + }; + } + + #[test] + fn parse_kernel_version_bad() { + assert_eq!(parse_kernel_version(""), None); + assert_eq!(parse_kernel_version("."), None); + assert_eq!(parse_kernel_version(".."), None); + assert_eq!(parse_kernel_version(".-foo"), None); + assert_eq!(parse_kernel_version("1..3"), None); + assert_eq!(parse_kernel_version("4...2"), None); + + assert_eq!(parse_kernel_version("a.a.a"), None); + assert_eq!(parse_kernel_version("invalid"), None); + assert_eq!(parse_kernel_version("foo"), None); + assert_eq!(parse_kernel_version("foo.1.3.4"), None); + + assert_eq!(parse_kernel_version("-1.2"), None); + assert_eq!(parse_kernel_version("+1.2"), None); + + assert_eq!(parse_kernel_version("3a"), None); + + assert_eq!(parse_kernel_version(".1"), None); + assert_eq!(parse_kernel_version(".1.2"), None); + + assert_eq!(parse_kernel_version("3.foo"), None); + assert_eq!(parse_kernel_version("42.12."), None); + assert_eq!(parse_kernel_version("4.10.-default"), None); + assert_eq!(parse_kernel_version("4.10.-default"), None); + } + + #[test] + fn parse_kernel_version_good() { + assert_eq!(parse_kernel_version("3.7"), Some(kver![3, 7])); + assert_eq!(parse_kernel_version("3.8"), Some(kver![3, 8])); + assert_eq!(parse_kernel_version("3.8.0"), Some(kver![3, 8, 0])); + assert_eq!(parse_kernel_version("3.8.12"), Some(kver![3, 8, 12])); + assert_eq!( + parse_kernel_version("3.8.12.10.0.2.5912"), + Some(kver![3, 8, 12, 10, 0, 2, 5912]) + ); + assert_eq!( + parse_kernel_version("42.12.1000"), + Some(kver![42, 12, 1000]) + ); + } + + #[test] + fn parse_kernel_version_with_suffix() { + assert_eq!( + parse_kernel_version("2.6.16.48foobar"), + Some(kver![2, 6, 16, 48]) + ); + assert_eq!(parse_kernel_version("2.6.16f00b4r"), Some(kver![2, 6, 16])); + assert_eq!( + parse_kernel_version("2.6.16.13rc2"), + Some(kver![2, 6, 16, 13]) + ); + assert_eq!( + parse_kernel_version("2.6.16.13-rc2"), + Some(kver![2, 6, 16, 13]) + 
); + assert_eq!( + parse_kernel_version("3.8.16-generic"), + Some(kver![3, 8, 16]) + ); + assert_eq!( + parse_kernel_version("6.12.49-1-default"), + Some(kver![6, 12, 49]) + ); + assert_eq!( + parse_kernel_version("4.9.27-default-foo.12.23"), + Some(kver![4, 9, 27]) + ); + assert_eq!(parse_kernel_version("5.15.0+debug"), Some(kver![5, 15, 0])); + assert_eq!(parse_kernel_version("6.1.0~beta1"), Some(kver![6, 1, 0])); + assert_eq!( + parse_kernel_version("5.4.0_custom.1.2"), + Some(kver![5, 4, 0]) + ); + assert_eq!(parse_kernel_version("3.8-4"), Some(kver![3, 8])); + } + + #[test] + fn kernel_version_eq_same_length() { + assert!(kver![3, 8] == kver![3, 8], "3.8 == 3.8"); + assert!(kver![3, 8, 12] == kver![3, 8, 12], "3.8.12 == 3.8.12"); + assert!(kver![0, 0] == kver![0, 0], "0.0 == 0.0"); + assert!(kver![6, 12, 49] == kver![6, 12, 49], "6.12.49 == 6.12.49"); + } + + #[test] + fn kernel_version_ne() { + assert!(kver![3, 8, 0] != kver![3, 8, 1], "3.8 != 3.8.1"); + assert!(kver![3, 8, 12] != kver![4, 8, 12], "3.8.12 != 4.8.12"); + assert!( + kver![6, 12, 49] != kver![6, 12, 49, 1], + "6.12.49 != 6.12.49.1" + ); + } + + #[test] + fn kernel_version_eq_trailing_zeros() { + // Trailing zeros should be treated as equal. 
+ assert!(kver![3, 8] == kver![3, 8, 0], "3.8 == 3.8.0"); + assert!(kver![3, 8] == kver![3, 8, 0, 0], "3.8 == 3.8.0.0"); + assert!(kver![3, 8] == kver![3, 8, 0, 0, 0], "3.8 == 3.8.0.0.0"); + assert!(kver![3, 8, 0] == kver![3, 8, 0, 0], "3.8.0 == 3.8.0.0"); + assert!(kver![3, 8, 0] == kver![3, 8], "3.8.0 == 3.8"); + assert!(kver![5, 0, 0, 0] == kver![5, 0], "5.0.0.0 == 5.0"); + } + + #[test] + fn kernel_version_lt_same_length() { + assert!(kver![3, 7] < kver![3, 8], "3.7 < 3.8"); + assert!(kver![3, 8] < kver![4, 0], "3.8 < 4.0"); + assert!(kver![3, 8, 11] < kver![3, 8, 12], "3.8.11 < 3.8.12"); + assert!(kver![2, 6, 32] < kver![3, 0, 0], "2.6.32 < 3.0.0"); + assert!(kver![5, 9, 99] < kver![5, 10, 0], "5.9.99 < 5.10.0"); + } + + #[test] + fn kernel_version_gt_same_length() { + assert!(kver![3, 8] > kver![3, 7], "3.8 > 3.7"); + assert!(kver![4, 0] > kver![3, 8], "4.0 > 3.8"); + assert!(kver![3, 8, 12] > kver![3, 8, 11], "3.8.12 > 3.8.11"); + assert!(kver![6, 0, 0] > kver![5, 99, 99], "6.0.0 > 5.99.99"); + } + + #[test] + fn kernel_version_lt_different_length() { + // Shorter version is effectively zero-padded. + assert!(kver![3, 1] < kver![3, 1, 1], "3.1[.0] < 3.1.1"); + assert!(kver![3, 1] < kver![3, 1, 18], "3.1[.0] < 3.1.18"); + assert!(kver![3, 0] < kver![3, 0, 0, 1], "3.0[.0.0] < 3.0.0.1"); + assert!( + kver![5, 4] < kver![5, 4, 0, 0, 1], + "5.4[.0.0.0] < 5.4.0.0.1" + ); + } + + #[test] + fn kernel_version_gt_different_length() { + assert!(kver![3, 1, 1] > kver![3, 1], "3.1.1 > 3.1[.0]"); + assert!(kver![3, 1, 18] > kver![3, 1], "3.1.18 > 3.1[.0]"); + assert!(kver![3, 0, 0, 1] > kver![3, 0], "3.0.0.1 > 3.0[.0.0]"); + assert!( + kver![5, 4, 0, 0, 1] > kver![5, 4], + "5.4.0.0.1 > 5.4[.0.0.0]" + ); + } + + #[test] + fn kernel_version_ordering_chain() { + // Example from the doc comment. 
+ assert!(kver![3, 0, 0] < kver![3, 1, 0], "3.0.0 < 3.1.0"); + assert!(kver![3, 1, 0] < kver![3, 1, 18], "3.1.0 < 3.1.18"); + assert!(kver![3, 1, 18] < kver![4, 0, 0], "3.1.18 < 4.0.0"); + + // Same example with implicit zeros. + assert!(kver![3] < kver![3, 1], "3[.0] < 3.1"); + assert!(kver![3, 1] < kver![3, 1, 18], "3.1[.0] < 3.1.18"); + assert!(kver![3, 1, 18] < kver![4, 0], "3.1.18 < 4.0[.0]"); + } + + #[test] + fn kernel_version_cmp_parsed() { + // Test comparison through parsed strings, like real kernel versions. + let v3_8 = parse_kernel_version("3.8.0-generic").expect("parse '3.8.0-generic'"); + let v4_10 = parse_kernel_version("4.10.23-1-default").expect("parse '4.10.23-1-default'"); + let v6_12 = parse_kernel_version("6.12.49-1-default").expect("parse '6.12.49-1-default'"); + + assert!(v3_8 < v4_10, "3.8.0 < 4.10.23"); + assert!(v4_10 < v6_12, "4.10.23 < 6.12.49"); + assert!(v3_8 < v6_12, "3.8.0 < 6.12.49"); + + // Parsed version with trailing zeros should equal shorter form. + let v5_4_0 = parse_kernel_version("5.4.0").expect("parse '5.4.0'"); + let v5_4 = parse_kernel_version("5.4").expect("parse '5.4'"); + assert!(v5_4_0 == v5_4, "5.4.0 == 5.4"); + } + + #[test] + fn kernel_version_gte() { + let v3_8 = parse_kernel_version("3.8.0-ubuntu.22.04").expect("parse '3.8.0-ubuntu.22.04"); + + assert!(kver![3, 8] >= v3_8, "3.8 >= 3.8.0-ubuntu.22.04"); + assert!(kver![3, 8, 1] >= v3_8, "3.8.1 >= 3.8.0-ubuntu.22.04"); + assert!(kver![3, 8, 0, 1] >= v3_8, "3.8.0.1 >= 3.8.0-ubuntu.22.04"); + assert!(kver![4] >= v3_8, "4 >= 3.8.0-ubuntu.22.04"); + assert!(kver![4, 0, 0, 0] >= v3_8, "4.0.0.0 >= 3.8.0-ubuntu.22.04"); + assert!(kver![3, 7, 999] < v3_8, "3.7.999 < 3.8.0-ubuntu.22.04"); + } + + #[test] + fn kernel_version_display() { + assert_eq!("linux-2", kver![2].to_string(), "Linux 2"); + assert_eq!( + "linux-2.6.32.182", + kver![2, 6, 32, 182].to_string(), + "Linux 2.6.32.182" + ); + assert_eq!("linux-3.0", kver![3, 0].to_string(), "Linux 3.0"); + assert_eq!( + 
"linux-4.1.18.2", + kver![4, 1, 18, 2].to_string(), + "Linux 4.1.18.2" + ); + } + + #[test] + fn host_kernel_version_uname26() { + // The UNAME26 personality lets us fake a pre-3.0 kernel version. + let _persona_guard = syscalls::scoped_personality(syscalls::PER_UNAME26); + + let host_kver = host_kernel_version(); + assert!( + kver![3, 0] > host_kver, + "UNAME26 personality should always result in a <3.0 kernel version: got {host_kver:?}" + ); + + assert!( + !is_gte!(3, 0), + "UNAME26 personality should always result in a <3.0 kernel version: is_gte!(3, 0) succeeded" + ); + } +} From 3ca67e1fe3ef7107038cae709836c1667d308851 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 12 Feb 2026 09:04:11 +1100 Subject: [PATCH 4/5] procfs: do not use backported fd-based mount API RHEL 8 backported the fd-based mount API to their 4.18 kernel, but based on testing in runc it seems there is a pretty serious performance bug in their backport that makes it unusable. To avoid using any such broken backports, we refuse to even try to use the new mount API if the running kernel is older than when the fd-based mount API was merged (Linux 5.2). This mirrors logic we have in pathrs-lite. Signed-off-by: Aleksa Sarai --- CHANGELOG.md | 6 ++++++ src/error.rs | 4 ++-- src/procfs.rs | 25 +++++++++++++++++++++++-- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fa419c0..fc22c1b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/). not a leak) but tools that search for file descriptor leaks (such as runc's test suite) could incorrectly classify this as a leak. We now close this `ProcfsBase` handle far more aggressively. +- RHEL 8 kernels have backports of the fd-based mount API (`fsopen(2)`, + `open_tree(2)`, et al.) but some `runc` testing found that they have very bad + (and very difficult to debug) performance issues. 
Thus, to avoid broken + backports libpathrs will now explicitly refuse to use the fd-based mount API + if the reported kernel version is pre-5.2 and will instead fallback to the + less-secure `open("/proc")`. [rust-issue20267]: https://github.com/rust-lang/rust/issues/20267 diff --git a/src/error.rs b/src/error.rs index 8225d8c9..e0eeffe0 100644 --- a/src/error.rs +++ b/src/error.rs @@ -92,7 +92,7 @@ pub(crate) enum ErrorImpl { #[error("feature {feature} is not implemented")] NotImplemented { feature: Cow<'static, str> }, - #[error("feature {feature} not supported on this kernel")] + #[error("feature {feature} not supported by the system")] NotSupported { feature: Cow<'static, str> }, #[error("invalid {name} argument: {description}")] @@ -220,7 +220,7 @@ impl ErrorKind { /// errno values where appropriate. pub(crate) fn errno(&self) -> Option { match self { - ErrorKind::NotImplemented => Some(libc::ENOSYS), + ErrorKind::NotImplemented | ErrorKind::NotSupported => Some(libc::ENOSYS), ErrorKind::InvalidArgument => Some(libc::EINVAL), #[cfg(feature = "capi")] ErrorKind::UnsupportedStructureData => Some(libc::E2BIG), diff --git a/src/procfs.rs b/src/procfs.rs index bec81eb6..db8256f4 100644 --- a/src/procfs.rs +++ b/src/procfs.rs @@ -63,7 +63,7 @@ use crate::{ flags::{OpenFlags, ResolverFlags}, resolvers::procfs::ProcfsResolver, syscalls, - utils::{self, FdExt, MaybeOwnedFd, RawProcfsRoot}, + utils::{self, kernel_version, FdExt, MaybeOwnedFd, RawProcfsRoot}, }; use std::{ @@ -76,7 +76,7 @@ use std::{ path::{Path, PathBuf}, }; -use once_cell::sync::OnceCell as OnceLock; +use once_cell::sync::{Lazy, OnceCell as OnceLock}; use rustix::{ fs::{self as rustix_fs, Access, AtFlags}, mount::{FsMountFlags, FsOpenFlags, MountAttrFlags, OpenTreeFlags}, @@ -806,11 +806,26 @@ impl<'fd> ProcfsHandleRef<'fd> { /// [lwn-procfs-overmounts]: https://lwn.net/Articles/934460/ pub type ProcfsHandle = ProcfsHandleRef<'static>; +/// Indicates whether this kernel is new enough that it should 
have the +/// upstream-merged version of the new mount API. This is necessary because +/// testing in runc found that RHEL 8 appears to have a broken backport of the +/// new mount API that causes serious performance regressions -- as such, we +/// should simply refuse to even try to use any of the new mount APIs on pre-5.2 +/// kernels. +// MSRV(1.80): Use LazyLock. +static HAS_UNBROKEN_MOUNT_API: Lazy = Lazy::new(|| kernel_version::is_gte!(5, 2)); + impl ProcfsHandle { /// Create a new `fsopen(2)`-based [`ProcfsHandle`]. This handle is safe /// against racing attackers changing the mount table and is guaranteed to /// have no overmounts because it is a brand-new procfs. pub(crate) fn new_fsopen(subset: bool) -> Result { + if !*HAS_UNBROKEN_MOUNT_API { + Err(ErrorImpl::NotSupported { + feature: "fsopen".into(), + })? + } + let sfd = syscalls::fsopen("proc", FsOpenFlags::FSOPEN_CLOEXEC).map_err(|err| { ErrorImpl::RawOsError { operation: "create procfs suberblock".into(), @@ -852,6 +867,12 @@ impl ProcfsHandle { /// guaranteed to be safe against racing attackers, and will not have /// overmounts unless `flags` contains `OpenTreeFlags::AT_RECURSIVE`. pub(crate) fn new_open_tree(flags: OpenTreeFlags) -> Result { + if !*HAS_UNBROKEN_MOUNT_API { + Err(ErrorImpl::NotSupported { + feature: "open_tree".into(), + })? + } + syscalls::open_tree( syscalls::BADFD, "/proc", From 96b00a1d08804fc35e49bce7e70db3428268fe86 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 12 Feb 2026 09:13:39 +1100 Subject: [PATCH 5/5] utils: fdinfo: do not require "ino" field for pre-5.14 kernels The "ino" field was added to /proc/$pid/fdinfo/$n in Linux 5.14 (by kcommit 3845f256a8b52 ("procfs/dmabuf: add inode number to /proc/*/fdinfo")), which means that we cannot require it exist on any kernels older than that. 
Unfortunately this does weaken the non-openat2 case somewhat (on pre-5.14 kernels, an attacker can now use a single /proc/self/environ file to fake any fdinfo) but there doesn't appear to be a nice way of authenticating the fdinfo file we read from: * The inode number of fdinfo files is randomly generated and thus can't be used to figure out what fdinfo file it is, nor that it is an authentic fdinfo file at all. * The "flags" field would present no real barrier to attackers because in practice in libpathrs it will always be "02", and there isn't really any other value for an attacker to guess. * You could imagine using the "pos" field as a very rudimentary challenge-response mechanism (lseek() to a random offset in the fd and then check that the fdinfo contains the same offset in the "pos" field). The hope is that it would be too difficult for an attacker to mirror the challenge in their fake fdinfo file. Unfortunately, this is not workable for several reasons: - We are almost always dealing with O_PATH file descriptors in libpathrs when looking at fdinfo, which means we cannot use lseek() in the first place. - The seek position is global state for the file descriptor so we would need to take a &mut version of BorrowedFd, but no such API really exists and &mut OwnedFd is very unergonomic. - Most importantly, this is _really_ ugly. Signed-off-by: Aleksa Sarai --- CHANGELOG.md | 8 +++ src/utils/fdinfo.rs | 131 ++++++++++++++++++++++++++++++++------------ 2 files changed, 103 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc22c1b5..e83e97f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,8 +40,16 @@ and this project adheres to [Semantic Versioning](http://semver.org/). backports libpathrs will now explicitly refuse to use the fd-based mount API if the reported kernel version is pre-5.2 and will instead fallback to the less-secure `open("/proc")`. +- libpathrs [0.2.0][] added some `fdinfo`-based hardening to the procfs + resolver when `openat2` is not available.
Unfortunately, one aspect of this + hardening had a hard requirement on [a kernel feature only added in Linux + 5.14][kcommit-3845f256a8b52] (namely the `ino` field in `fdinfo`) and thus + inadvertently increased our minimum kernel version requirement quite + significantly. This additional hardening is now only treated as mandatory if + the host kernel version is Linux 5.14 or newer. [rust-issue20267]: https://github.com/rust-lang/rust/issues/20267 +[kcommit-3845f256a8b52]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3845f256a8b527127bfbd4ced21e93d9e89aa6d7 ## [0.2.3] - 2026-01-29 ## diff --git a/src/utils/fdinfo.rs b/src/utils/fdinfo.rs index c0d94849..cb4275b0 100644 --- a/src/utils/fdinfo.rs +++ b/src/utils/fdinfo.rs @@ -32,7 +32,7 @@ use crate::{ error::{Error, ErrorExt, ErrorImpl, ErrorKind}, - utils::FdExt, + utils::{kernel_version, FdExt}, }; use std::{ @@ -100,44 +100,60 @@ where { let fd = fd.as_fd(); - // Verify that the "ino" field in fdinfo matches the real inode number - // of our file descriptor. This makes attacks harder (if not near - // impossible, outside of very constrained situations): + // Verify that the "ino" field in fdinfo matches the real inode number of + // our file descriptor. This makes attacks harder (if not near impossible, + // outside of very constrained situations): // - // * An attacker would probably struggle to always accurately guess the inode - // number of files that the process is trying to operate on. Yes, if they - // know the victim process's access patterns of procfs they could probably - // make an educated guess, but most files do not have stable inode numbers in - // procfs. + // * An attacker would probably struggle to always accurately guess the inode + // number of files that the process is trying to operate on. 
Yes, if they know + // the victim process's access patterns of procfs they could probably make an + // educated guess, but most files do not have stable inode numbers in procfs. // - // * An attacker can no longer bind-mount their own fdinfo directory with just - // a buch of handles to "/proc" open (assuming the attacker is trying to - // spoof "mnt_id"), because the inode numbers won't match. + // * An attacker can no longer bind-mount their own fdinfo directory with just a + // bunch of handles to "/proc" open (assuming the attacker is trying to spoof + // "mnt_id"), because the inode numbers won't match. // - // They also can't really fake inode numbers in real procfs fdinfo - // files, so they would need to create fake fdinfo files using - // individual file arbitrary-data gadgets (like /proc/self/environ). - // However, every program only has one environment so they would need - // to create a new child process for every fd they are trying to - // attack simultaneously (and accurately update their environment - // data to avoid detection). + // They also can't really fake inode numbers in real procfs fdinfo files, + // so they would need to create fake fdinfo files using individual file + // arbitrary-data gadgets (like /proc/self/environ). However, every + // program only has one environment so they would need to create a new + // child process for every fd they are trying to attack simultaneously + // (and accurately update their environment data to avoid detection). // - // This isn't perfect protection by any means, but it's probably the - // best we can do for very old kernels (given the constraints). At the very - // least, it makes exploitation _much_ harder than if we didn't do anything - // at all. + // This isn't perfect protection by any means, but it's probably the best we + // can do for very old kernels (given the constraints). At the very least, + // it makes exploitation _much_ harder than if we didn't do anything at all.
let actual_ino: u64 = fd.metadata().wrap("get inode number of fd")?.ino(); - let fdinfo_ino: u64 = + let fdinfo_ino: Option = match parse_and_find_fdinfo_field(rdr, "ino").map_err(|err| (err.kind(), err)) { - Ok(Some(ino)) => Ok(ino), - // "ino" *must* exist as a field -- make sure we return a - // SafetyViolation here if it is missing or an invalid value - // (InternalError), otherwise an attacker could silence this check - // by creating a "ino"-less fdinfo. + Ok(Some(ino)) => Ok(Some(ino)), + Ok(None) => { + // Unfortunately, the "ino" field in fdinfo was only added in + // Linux 5.14 (see kcommit 3845f256a8b52 ("procfs/dmabuf: add + // inode number to /proc/*/fdinfo")) and so we cannot require + // this for such old kernels. + // + // However, *for post-5.14 kernels*, "ino" *must* exist as a + // field, so make sure we return a SafetyViolation here if it is + // missing. Otherwise an attacker could silence this check by + // creating a "ino"-less fdinfo. + // + // [kcommit-3845f256a8b52]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3845f256a8b527127bfbd4ced21e93d9e89aa6d7 + if kernel_version::is_gte!(5, 14) { + Err(ErrorImpl::SafetyViolation { + description: format!( + r#"fd {:?} has a fake fdinfo: missing "ino" field"#, + fd.as_raw_fd(), + ) + .into(), + })?; + } + Ok(None) + } // TODO: Should we actually match for ErrorImpl::ParseIntError here? - Ok(None) | Err((ErrorKind::InternalError, _)) => Err(ErrorImpl::SafetyViolation { + Err((ErrorKind::InternalError, _)) => Err(ErrorImpl::SafetyViolation { description: format!( - r#"fd {:?} has a fake fdinfo: invalid or missing "ino" field"#, + r#"fd {:?} has a fake fdinfo: invalid "ino" field"#, fd.as_raw_fd(), ) .into(), @@ -146,14 +162,17 @@ where // Pass through any other errors. Err((_, err)) => Err(err), }?; - if actual_ino != fdinfo_ino { - Err(ErrorImpl::SafetyViolation { + // MSRV(1.85): Use let chain here (Rust 2024). 
+ if let Some(fdinfo_ino) = fdinfo_ino { + if actual_ino != fdinfo_ino { + Err(ErrorImpl::SafetyViolation { description: format!( "fd {:?} has a fake fdinfo: wrong inode number (ino is {fdinfo_ino:X} not {actual_ino:X})", fd.as_raw_fd() ) .into(), })?; + } } // Reset the position in the fdinfo file, and re-parse it to look for @@ -169,7 +188,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::error::ErrorKind; + use crate::{error::ErrorKind, syscalls}; use std::{ fmt::Debug, @@ -528,9 +547,13 @@ mod tests { Ok(()) } - // Make sure that a missing "ino" entry also fails. + // Make sure that a missing "ino" entry also fails on new kernels. #[test] - fn fd_get_verify_fdinfo_no_ino() -> Result<(), Error> { + fn fd_get_verify_fdinfo_no_ino_kernel514() -> Result<(), Error> { + if !kernel_version::is_gte!(5, 14) { + return Ok(()); + } + const FAKE_FDINFO: &[u8] = indoc! {b" foo: abcdef mnt_id: 12345 @@ -565,6 +588,42 @@ mod tests { Ok(()) } + // Make sure that a missing "ino" entry succeeds on old kernels (emulated). + #[test] + fn fd_get_verify_fdinfo_no_ino_oldkernel() -> Result<(), Error> { + // The UNAME26 personality lets us fake a pre-5.14 kernel version to + // kernel_version::is_gte. + let _persona_guard = syscalls::scoped_personality(syscalls::PER_UNAME26); + + const FAKE_FDINFO: &[u8] = indoc! 
{b" + foo: abcdef + mnt_id: 12345 + "}; + + let file = File::open(".").context("open dummy file")?; + + check_fd_get_verify_fdinfo::( + &mut Cursor::new(&FAKE_FDINFO), + &file, + "mnt_id", + Ok(Some(12345)), + ) + .expect(r#"get "mnt_id" from fdinfo with missing ino (pre-5.14)"#); + + check_fd_get_verify_fdinfo::(&mut Cursor::new(&FAKE_FDINFO), &file, "ino", Ok(None)) + .expect(r#"get "ino" from fdinfo with missing ino (pre-5.14)"#); + + check_fd_get_verify_fdinfo::( + &mut Cursor::new(&FAKE_FDINFO), + &file, + "non_exist", + Ok(None), + ) + .expect(r#"get "non_exist" from fdinfo with missing ino (pre-5.14)"#); + + Ok(()) + } + // Make sure that an "ino" entry with the wrong type results in a // SafetyViolation error, not an integer parsing error. #[test]