diff --git a/crates/sandlock-core/src/netlink/handlers.rs b/crates/sandlock-core/src/netlink/handlers.rs
index 49a5525..303904a 100644
--- a/crates/sandlock-core/src/netlink/handlers.rs
+++ b/crates/sandlock-core/src/netlink/handlers.rs
@@ -18,9 +18,27 @@ use crate::netlink::{proxy, state::NetlinkState};
 use crate::seccomp::notif::{read_child_mem, write_child_mem, NotifAction, OnInjectSuccess};
 use crate::sys::structs::SeccompNotif;
 
+const AF_UNIX: u64 = 1;
+const AF_INET: u64 = 2;
+const AF_INET6: u64 = 10;
 const AF_NETLINK: u64 = 16;
 const NETLINK_ROUTE: u64 = 0;
 
+/// Socket families allowed to reach the kernel. Everything else returns
+/// EAFNOSUPPORT — the same errno the kernel itself uses for unknown
+/// families, so callers see a normal "not supported" error rather than a
+/// sandbox-flavored one.
+///
+/// The set is intentionally tiny: an XOA agent has no legitimate need for
+/// AF_ALG, AF_PACKET, AF_VSOCK, AF_XDP, AF_TIPC, AF_RDS, AF_BLUETOOTH, and
+/// the rest of the niche families that have historically yielded LPEs
+/// (Copy Fail / CVE-2026-31431 via AF_ALG, Dirty Pipe-adjacent splice
+/// primitives, AF_PACKET PACKET_MMAP UAFs, etc.). Closing the surface
+/// once is cheaper than chasing one CVE per family.
+fn family_allowed(domain: u64) -> bool {
+    matches!(domain, AF_UNIX | AF_INET | AF_INET6 | AF_NETLINK)
+}
+
 /// Resolve `notif.pid` (which is a TID per the kernel's `task_pid_vnr`) to
 /// the enclosing thread group id. fds are shared across all threads of a
 /// process, so cookie entries must be keyed by TGID — otherwise a cookie
@@ -56,7 +74,8 @@ fn read_struct(
 /// Intercept `socket(AF_NETLINK, *, NETLINK_ROUTE)` and substitute one end
 /// of a `socketpair(AF_UNIX, SOCK_SEQPACKET)`. A tokio task takes the
 /// supervisor-side end and speaks synthesized NETLINK_ROUTE replies.
-/// Other domains pass through; other netlink protocols are denied.
+/// Allowed domains pass through; AF_NETLINK is virtualized; everything
+/// else (and non-NETLINK_ROUTE netlink protocols) returns EAFNOSUPPORT.
 pub async fn handle_socket(
     notif: &SeccompNotif,
     state: &Arc,
@@ -64,6 +83,9 @@ pub async fn handle_socket(
     let domain = notif.data.args[0];
     let protocol = notif.data.args[2];
 
+    if !family_allowed(domain) {
+        return NotifAction::Errno(libc::EAFNOSUPPORT);
+    }
     if domain != AF_NETLINK {
         return NotifAction::Continue;
     }
diff --git a/crates/sandlock-core/tests/integration/test_netlink_virt.rs b/crates/sandlock-core/tests/integration/test_netlink_virt.rs
index 849469c..b39cb1c 100644
--- a/crates/sandlock-core/tests/integration/test_netlink_virt.rs
+++ b/crates/sandlock-core/tests/integration/test_netlink_virt.rs
@@ -227,6 +227,84 @@ async fn sys_class_net_blocked() {
     assert!(result.success());
 }
 
+/// Regression for Copy Fail (CVE-2026-31431). The exploit's first step is
+/// `socket(AF_ALG, SOCK_SEQPACKET, 0)`, then `bind()` to a sockaddr_alg
+/// naming "authencesn(hmac(sha256),cbc(aes))". If `socket()` is denied
+/// with EAFNOSUPPORT the page-cache corruption primitive is unreachable.
+#[tokio::test]
+async fn af_alg_socket_blocked() {
+    let out = temp_out("af-alg-blocked");
+    let script = format!(concat!(
+        "import socket, errno\n",
+        "AF_ALG = 38\n",
+        "try:\n",
+        " s = socket.socket(AF_ALG, socket.SOCK_SEQPACKET, 0)\n",
+        " s.close()\n",
+        " result = 'ALLOWED'\n",
+        "except OSError as e:\n",
+        " result = f'BLOCKED:{{e.errno}}'\n",
+        "open('{out}', 'w').write(result)\n",
+    ), out = out.display());
+
+    let policy = base_policy().build().unwrap();
+    let result = Sandbox::run_interactive(&policy, Some("test"), &["python3", "-c", &script])
+        .await.unwrap();
+
+    let contents = std::fs::read_to_string(&out).unwrap_or_default();
+    let _ = std::fs::remove_file(&out);
+    // EAFNOSUPPORT == 97 on Linux. We assert the exact errno so a future
+    // accidental switch to EPERM/EACCES (which would surface differently
+    // to callers) is caught.
+    assert_eq!(
+        contents, "BLOCKED:97",
+        "AF_ALG socket() must return EAFNOSUPPORT, got: {contents}"
+    );
+    assert!(result.success());
+}
+
+/// Other niche socket families — same threat model as AF_ALG (kernel LPE
+/// surface that XOA agents have no business reaching). AF_ALG has its own
+/// dedicated test above; this one guards the broader class.
+#[tokio::test]
+async fn niche_socket_families_blocked() {
+    // (name, AF_* numeric value)
+    let families: &[(&str, i32)] = &[
+        ("AF_PACKET", 17), // PACKET_MMAP has had UAFs
+        ("AF_VSOCK", 40), // recurring use-after-frees
+        ("AF_XDP", 44),
+        ("AF_TIPC", 30),
+    ];
+
+    for (name, af) in families {
+        let out = temp_out(&format!("family-blocked-{}", name));
+        let script = format!(concat!(
+            "import socket\n",
+            "try:\n",
+            " s = socket.socket({af}, socket.SOCK_RAW, 0)\n",
+            " s.close()\n",
+            " result = 'ALLOWED'\n",
+            "except OSError as e:\n",
+            " result = f'BLOCKED:{{e.errno}}'\n",
+            "open('{out}', 'w').write(result)\n",
+        ), af = af, out = out.display());
+
+        let policy = base_policy().build().unwrap();
+        let result = Sandbox::run_interactive(&policy, Some("test"), &["python3", "-c", &script])
+            .await.unwrap();
+
+        let contents = std::fs::read_to_string(&out).unwrap_or_default();
+        let _ = std::fs::remove_file(&out);
+        // Pin the exact errno, as the AF_ALG test does. A bare "BLOCKED:"
+        // prefix also matches kernel-side refusals (e.g. EPERM for AF_PACKET
+        // without CAP_NET_RAW), which would hide a missing supervisor deny.
+        assert_eq!(
+            contents, "BLOCKED:97",
+            "{name} socket() must return EAFNOSUPPORT, got: {contents}"
+        );
+        assert!(result.success());
+    }
+}
+
 #[tokio::test]
 async fn non_route_netlink_still_blocked() {
     let out = temp_out("netlink-audit-blocked");