diff --git a/crates/sandlock-core/src/chroot/dispatch.rs b/crates/sandlock-core/src/chroot/dispatch.rs index c1aed9d..035301b 100644 --- a/crates/sandlock-core/src/chroot/dispatch.rs +++ b/crates/sandlock-core/src/chroot/dispatch.rs @@ -2,6 +2,42 @@ //! //! Intercepts path-resolving syscalls, rewrites paths via the resolve module, //! and performs on-behalf operations. Composes with COW when active. +//! +//! # Continue safety (issue #27) +//! +//! Per `seccomp_unotify(2)`, returning `Continue` lets the kernel re-read +//! user-memory pointers after the supervisor's decision, which is racy in a +//! multi-threaded child. The handlers in this module fall into four +//! categories: +//! +//! 1. **On-behalf with injected fd** (handle_chroot_open's primary path): +//! the supervisor opens via `openat2(RESOLVE_IN_ROOT)` and returns +//! `InjectFdSend` — the kernel does not re-read the path string at all. +//! TOCTOU-safe. +//! +//! 2. **On-behalf result writes** (stat/statx/readlink/getcwd/statfs): +//! the supervisor performs the underlying syscall against the +//! chroot-resolved host path and writes the result into the child's +//! output buffer. The decision returned is `ReturnValue`/`Errno`, +//! not `Continue` — TOCTOU-safe. +//! +//! 3. **Soft fall-through on read failure**: many handlers `return +//! Continue` if `read_path` or `write_child_mem` fails. The kernel's +//! own re-read will fail the same way and the syscall surfaces an +//! EFAULT/-style error to the child. No security decision was made +//! on contents we couldn't read, so this is safe. +//! +//! 4. **Path-rewrite-then-Continue** (handle_chroot_exec, handle_chroot_chdir): +//! the supervisor rewrites `path_ptr` to `/proc/self/fd/N` and returns +//! `Continue` because the kernel must execute the syscall (execve +//! replaces the address space; chdir requires the kernel's per-task +//! fs_struct update). The TOCTOU window is real here — a racing +//! sibling thread can substitute a different path string between our +//! write and the kernel's read. The bound is Landlock: a racing path +//! is still subject to `landlock_restrict_self`. See per-site comments +//! in `handle_chroot_exec` and `handle_chroot_chdir`. The planned +//! mitigation is opt-in `CLONE_THREAD` deny in the BPF filter, which +//! eliminates the racer entirely. use std::ffi::CString; use std::io::{Read, Seek, SeekFrom, Write}; @@ -1322,6 +1358,18 @@ pub(crate) async fn handle_chroot_chdir( return NotifAction::Errno(libc::EFAULT); } + // KNOWN TOCTOU LIMITATION (issue #27, same class as case 2): + // + // We've written "/proc/self/fd/N" into the child's path_ptr and + // returned Continue. The kernel will re-read path_ptr to perform + // the actual chdir. A multi-threaded child can race this read + // and substitute a different path. + // + // chdir cannot be on-behalf'd: the kernel must update the calling + // task's fs_struct (per-task cwd), which the supervisor cannot do + // for the child. The race window is bounded by Landlock — a + // racing path is still subject to landlock_restrict_self. The + // planned mitigation is opt-in CLONE_THREAD deny. NotifAction::Continue } diff --git a/crates/sandlock-core/src/cow/dispatch.rs b/crates/sandlock-core/src/cow/dispatch.rs index a2fb602..0f1f481 100644 --- a/crates/sandlock-core/src/cow/dispatch.rs +++ b/crates/sandlock-core/src/cow/dispatch.rs @@ -2,6 +2,27 @@ //! //! Reads paths from child memory, delegates to SeccompCowBranch, //! and injects results (fds, stat structs, readlink strings, dirents) back. +//! +//! # Continue safety (issue #27) +//! +//! Every `Continue` in this module is a *fall-through* — the COW layer +//! decided the syscall is outside its scope, so it lets the kernel handle +//! the original syscall normally. No COW path was modified or rewritten +//! when we return Continue, so the kernel's re-read sees exactly what the +//! child originally passed. The fall-through happens when: +//! +//! * No COW branch is active (`cow_state.branch == None`). +//! * The path doesn't match the COW prefix (`!cow.matches(path)`). +//! * `read_path` / `read_child_mem` / `CString::new` failed. +//! * The supervisor's own open/copy attempt failed and we want the +//! kernel to surface its own error. +//! +//! Because Continue means "we didn't intervene," the seccomp_unotify +//! TOCTOU concern doesn't apply: we're not making a security decision +//! whose validity depends on the kernel re-reading the same memory we +//! read. Path-based security enforcement for these fall-throughs is +//! provided by Landlock (or by the chroot dispatcher, when chroot mode +//! is active and runs before COW). use std::os::unix::io::{FromRawFd, OwnedFd, RawFd}; use std::path::{Component, Path, PathBuf}; diff --git a/crates/sandlock-core/src/netlink/handlers.rs b/crates/sandlock-core/src/netlink/handlers.rs index 701c107..49a5525 100644 --- a/crates/sandlock-core/src/netlink/handlers.rs +++ b/crates/sandlock-core/src/netlink/handlers.rs @@ -1,3 +1,16 @@ +//! Netlink virtualization handlers — interpose AF_NETLINK sockets as +//! unix socketpairs driven by a synthesized NETLINK_ROUTE responder. +//! +//! Continue safety (issue #27): every Continue here is dispatch routing +//! based on register args (socket domain, fd number) or a fall-through +//! after harmless cosmetic adjustments (recvmsg pre-zeroing). Decisions +//! that require security enforcement (non-NETLINK_ROUTE protocol) return +//! Errno; substitution returns InjectFdSendTracked. The fd-cookie check +//! (`state.is_cookie(tgid, fd)`) examines a register arg, not user memory, +//! so the seccomp_unotify TOCTOU class doesn't apply: a racing thread +//! cannot change the fd number stored in another thread's syscall +//! registers. + use std::os::unix::io::{FromRawFd, OwnedFd, RawFd}; use std::sync::Arc; diff --git a/crates/sandlock-core/src/network.rs b/crates/sandlock-core/src/network.rs index 0f20842..d8d99f4 100644 --- a/crates/sandlock-core/src/network.rs +++ b/crates/sandlock-core/src/network.rs @@ -403,9 +403,16 @@ async fn sendmsg_on_behalf( // 1. Read full msghdr struct (56 bytes on x86_64): // msg_name(8) + msg_namelen(4) + pad(4) + msg_iov(8) + msg_iovlen(8) // + msg_control(8) + msg_controllen(8) + msg_flags(4) + pad(4) + // + // If we cannot read the msghdr, fail the syscall with EFAULT instead + // of falling through to Continue. Continue would let the kernel + // re-read child memory and (for a racing thread that just remapped + // it back) potentially execute the sendmsg without the IP allowlist + // check this handler exists to enforce. EFAULT matches what the + // kernel itself would return for an unreadable msghdr pointer. let msghdr_bytes = match read_child_mem(notif_fd, notif.id, notif.pid, msghdr_ptr, 56) { Ok(b) if b.len() >= 56 => b, - _ => return NotifAction::Continue, + _ => return NotifAction::Errno(libc::EFAULT), }; let msg_name_ptr = u64::from_ne_bytes(msghdr_bytes[0..8].try_into().unwrap()); @@ -535,6 +542,23 @@ async fn sendmsg_on_behalf( /// from child memory, validates the destination, duplicates the socket via /// pidfd_getfd, and performs the syscall itself. The child's memory is never /// re-read by the kernel after validation. +/// +/// Continue safety (issue #27): the on-behalf paths don't return Continue +/// at all (they return ReturnValue/Errno after performing the syscall in +/// the supervisor). The Continue cases in this module are: +/// 1. Non-IP families (AF_UNIX etc.) — the IP allowlist doesn't apply; +/// Landlock IPC scoping is the enforcement boundary. +/// 2. Connected sockets with addr_ptr == 0 — the address was already +/// validated at connect time, so the kernel re-read of (nothing) is +/// moot. +/// 3. The fall-through case below — only reachable if the BPF filter +/// mis-routes a syscall; the kernel handles it normally. +/// In sendmsg_on_behalf, the msghdr read failure path returns +/// Errno(EFAULT) rather than Continue: a racing thread that briefly +/// unmaps the msghdr could otherwise force a fall-through that lets the +/// kernel execute sendmsg without the allowlist check. Sub-buffer read +/// failures (sockaddr/iovec/control) already return Errno(EIO) and so +/// don't bypass the check either. pub(crate) async fn handle_net( notif: &SeccompNotif, ctx: &Arc, diff --git a/crates/sandlock-core/src/port_remap.rs b/crates/sandlock-core/src/port_remap.rs index f87a8e7..33dabb3 100644 --- a/crates/sandlock-core/src/port_remap.rs +++ b/crates/sandlock-core/src/port_remap.rs @@ -3,6 +3,18 @@ // Intercepts bind and getsockname syscalls to track and remap ports. // When a sandbox binds to a port that conflicts with another sandbox, // the supervisor can transparently remap it to an available port. +// +// Continue safety (issue #27): +// - handle_bind performs the bind on-behalf via pidfd_getfd (kernel +// object, not racy user-memory string) and returns ReturnValue/Errno. +// No security decision returns Continue. +// - handle_getsockname returns Continue at the end so the kernel +// performs the real getsockname; the supervisor only translates the +// port number afterwards. The remaining Continues guard early-exit +// conditions (NULL pointers, undersized addrlen, read failures). +// None of these involve approving access based on user-memory +// contents — port-remap is naming, not authorization. Network +// allowlisting lives in network.rs and is on-behalf there. use std::collections::HashMap; use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr, TcpListener}; diff --git a/crates/sandlock-core/src/procfs.rs b/crates/sandlock-core/src/procfs.rs index 1320f35..c0da066 100644 --- a/crates/sandlock-core/src/procfs.rs +++ b/crates/sandlock-core/src/procfs.rs @@ -3,6 +3,20 @@ // Intercepts openat syscalls that target sensitive /proc paths or virtual // files (/proc/cpuinfo, /proc/meminfo). For virtual files, creates a memfd // with fake content and injects it into the child's fd table. +// +// Continue safety (issue #27): +// - Sensitive-path denials use Errno(EACCES) — TOCTOU-safe: the seccomp +// response *is* the answer; the kernel does not re-read user memory. +// - Virtualized paths (cpuinfo, meminfo, mounts, /proc/net/*, hostname, +// etc.) use InjectFdSend with a sealed memfd — the child's fd table +// ends up with our memfd, and the kernel never re-resolves the path +// string after injection. +// - Continue is reserved for fall-through cases: read_path failed (kernel +// will re-read and EFAULT identically), the path doesn't match any +// virtualized entry, or supervisor-side I/O on /proc//fd/ +// read_link returned an error. None of these cases involve the +// supervisor approving a syscall based on user-controlled string +// contents, so the seccomp_unotify TOCTOU class doesn't apply. use std::collections::HashSet; use std::io::{Seek, SeekFrom, Write}; diff --git a/crates/sandlock-core/src/random.rs b/crates/sandlock-core/src/random.rs index 5861a41..05988c1 100644 --- a/crates/sandlock-core/src/random.rs +++ b/crates/sandlock-core/src/random.rs @@ -1,6 +1,13 @@ // Deterministic random handler — intercepts getrandom() syscall and reads // from /dev/urandom or /dev/random, returning seeded PRNG bytes instead of // kernel-provided random bytes. +// +// Continue safety (issue #27): every `Continue` here is a fallback when +// the supervisor cannot provide deterministic bytes (memfd_create failed, +// write_child_mem failed). Falling through means the child gets real +// kernel entropy instead of our seeded stream — a determinism failure, +// not a security failure. No access-control decision is being made on +// user-memory contents, so the seccomp_unotify TOCTOU class doesn't apply. use rand::RngCore; use rand_chacha::ChaCha8Rng; diff --git a/crates/sandlock-core/src/resource.rs b/crates/sandlock-core/src/resource.rs index 00867ca..d4277bf 100644 --- a/crates/sandlock-core/src/resource.rs +++ b/crates/sandlock-core/src/resource.rs @@ -1,4 +1,15 @@ // Resource limit handlers — memory and process limit enforcement. +// +// Continue safety (issue #27): every `Continue` in this module is safe. +// All decisions here are on scalar register args (clone flags, mmap len, +// brk address, etc.) which are copied into the seccomp_notif struct at +// notification time — they are *not* pointers into racy user memory. +// The kernel's re-read of the syscall args after Continue comes from the +// suspended calling thread's saved registers, which a sibling thread +// cannot mutate. So even though we return Continue after taking a +// security-relevant action (e.g., counting an allocation against the +// memory limit), there is no TOCTOU substitution window for the values +// we examined. use std::sync::Arc; use tokio::sync::Mutex; diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index 34e1178..737aefd 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -3,6 +3,18 @@ // Each syscall number maps to an ordered chain of handlers. The chain is walked // until a handler returns a non-Continue action (or the chain is exhausted, in // which case Continue is returned). +// +// Continue safety (issue #27): +// - The chain walker treats Continue as "this handler did not intervene, +// try the next one." A final Continue (no handler intervened, or chain +// exhausted) means the syscall passes through to the kernel as-issued. +// The kernel still enforces Landlock and the BPF filter on the +// untouched syscall, so dispatch-level Continue is not a security +// decision — it's the absence of one. +// - The conditional shim closures (random/hostname/etc_hosts opens) that +// wrap an Option-returning helper translate `None` into Continue, +// which is the same "not my path, next handler" semantics. None of +// them approve a syscall based on user-memory contents. use std::collections::HashMap; use std::future::Future; diff --git a/crates/sandlock-core/src/time.rs b/crates/sandlock-core/src/time.rs index 301dd6e..8a24a6b 100644 --- a/crates/sandlock-core/src/time.rs +++ b/crates/sandlock-core/src/time.rs @@ -36,6 +36,14 @@ pub(crate) fn calculate_time_offset(time_start: SystemTime) -> i64 { /// For absolute monotonic timers, the child computed the deadline using a /// vDSO-shifted clock (offset was added). We subtract the offset here so the /// kernel receives the correct real deadline. +/// +/// Continue safety (issue #27): every `Continue` in this function is safe. +/// This handler does virtual-time correctness, not access control — it never +/// denies a syscall based on user memory, so the seccomp_unotify TOCTOU +/// re-read does not apply. A racing thread could rewrite the timespec +/// between our adjustment and the kernel's read, but the only effect is +/// that virtual-time bookkeeping is bypassed for that one call. No +/// security boundary depends on the value we read or wrote. pub(crate) fn handle_timer( notif: &SeccompNotif, time_offset: i64,