From 71d78c20b0af1d61cca826e41fd4881276c54d43 Mon Sep 17 00:00:00 2001 From: Minh Dang Quang Date: Wed, 25 Mar 2026 21:36:01 +0700 Subject: [PATCH 01/45] fix(ci): heading-level mismatch in agent diagnostic regex (#604) Signed-off-by: minhdqdev --- .github/workflows/issue-triage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/issue-triage.yml b/.github/workflows/issue-triage.yml index 50bdd31e1..ec87af503 100644 --- a/.github/workflows/issue-triage.yml +++ b/.github/workflows/issue-triage.yml @@ -23,7 +23,7 @@ jobs: // The template placeholder starts with "Example:" — if that's still // there or the section is empty, the reporter didn't fill it in. const diagnosticMatch = body.match( - /## Agent Diagnostic\s*\n([\s\S]*?)(?=\n## |\n$)/ + /### Agent Diagnostic\s*\n([\s\S]*?)(?=\n### |\n$)/ ); const hasSubstantiveDiagnostic = diagnosticMatch From bd7b388ab6dce94d51e1e2c4b1d578d15f561d88 Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Wed, 25 Mar 2026 08:20:23 -0700 Subject: [PATCH 02/45] fix(sandbox): remove double response relay in passthrough credential path (#610) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit relay_passthrough_with_credentials called relay_http_request_with_resolver (which internally relays the upstream response back to the client) and then immediately called relay_response_to_client a second time. The second call blocked forever waiting for a response that would never arrive, deadlocking every CONNECT tunnel after its first request/response pair. This caused npm install (and any HTTP/1.1 keep-alive client) to hang indefinitely when routed through the sandbox proxy without L7 rules. The L7-inspection path (relay_rest) was not affected — it correctly makes a single call to relay_http_request_with_resolver. 
--- crates/openshell-sandbox/src/l7/relay.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-sandbox/src/l7/relay.rs index 618280475..940e7f94c 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-sandbox/src/l7/relay.rs @@ -275,16 +275,14 @@ where "HTTP_REQUEST", ); - // Forward request with credential rewriting. - let keep_alive = + // Forward request with credential rewriting and relay the response. + // relay_http_request_with_resolver handles both directions: it sends + // the request upstream and reads the response back to the client. + let reusable = crate::l7::rest::relay_http_request_with_resolver(&req, client, upstream, resolver) .await?; - // Relay response back to client. - let reusable = - crate::l7::rest::relay_response_to_client(upstream, client, &req.action).await?; - - if !keep_alive || !reusable { + if !reusable { break; } } From 0e5ebb6f76ef7c7f4839ab3e0429167ecbdd921d Mon Sep 17 00:00:00 2001 From: Maxime Grenu <69890511+cluster2600@users.noreply.github.com> Date: Thu, 26 Mar 2026 00:20:30 +0100 Subject: [PATCH 03/45] fix(router): use max_completion_tokens for OpenAI GPT-5+ validation (#575) * fix(router): use max_completion_tokens for OpenAI GPT-5+ validation probe OpenAI GPT-5 models reject the legacy max_tokens parameter and require max_completion_tokens. The inference validation probe now sends max_completion_tokens as the primary parameter, with an automatic fallback to max_tokens when the backend returns HTTP 400 (for legacy/self-hosted backends that only support the older parameter). 
Closes #517 Signed-off-by: Maxime Grenu * style(router): fix cargo fmt import order and line length --------- Signed-off-by: Maxime Grenu Co-authored-by: John Myers --- crates/openshell-router/src/backend.rs | 161 ++++++++++++++++++++++- crates/openshell-server/src/inference.rs | 2 +- 2 files changed, 157 insertions(+), 6 deletions(-) diff --git a/crates/openshell-router/src/backend.rs b/crates/openshell-router/src/backend.rs index 0708cfb03..d82ea082c 100644 --- a/crates/openshell-router/src/backend.rs +++ b/crates/openshell-router/src/backend.rs @@ -31,6 +31,11 @@ struct ValidationProbe { path: &'static str, protocol: &'static str, body: bytes::Bytes, + /// Alternate body to try when the primary probe fails with HTTP 400. + /// Used for OpenAI chat completions where newer models require + /// `max_completion_tokens` while legacy/self-hosted backends only + /// accept `max_tokens`. + fallback_body: Option, } /// Response from a proxied HTTP request to a backend (fully buffered). @@ -163,12 +168,17 @@ fn validation_probe(route: &ResolvedRoute) -> Result Result Result Result, + body: bytes::Bytes, +) -> Result { + let response = send_backend_request(client, route, "POST", path, headers, body) .await .map_err(|err| match err { RouterError::UpstreamUnavailable(details) => ValidationFailure { @@ -253,12 +306,12 @@ pub async fn verify_backend_endpoint( details, }, })?; - let url = build_backend_url(&route.endpoint, probe.path); + let url = build_backend_url(&route.endpoint, path); if response.status().is_success() { return Ok(ValidatedEndpoint { url, - protocol: probe.protocol.to_string(), + protocol: protocol.to_string(), }); } @@ -376,7 +429,7 @@ fn build_backend_url(endpoint: &str, path: &str) -> String { #[cfg(test)] mod tests { - use super::{build_backend_url, verify_backend_endpoint}; + use super::{ValidationFailureKind, build_backend_url, verify_backend_endpoint}; use crate::config::ResolvedRoute; use openshell_core::inference::AuthHeader; use 
wiremock::matchers::{body_partial_json, header, method, path}; @@ -463,4 +516,102 @@ mod tests { assert_eq!(validated.protocol, "openai_chat_completions"); assert_eq!(validated.url, "mock://test-backend/v1/chat/completions"); } + + /// GPT-5+ models reject `max_tokens` — the primary probe uses + /// `max_completion_tokens` so validation should succeed directly. + #[tokio::test] + async fn verify_openai_chat_uses_max_completion_tokens() { + let mock_server = MockServer::start().await; + let route = test_route( + &mock_server.uri(), + &["openai_chat_completions"], + AuthHeader::Bearer, + ); + + Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .and(body_partial_json(serde_json::json!({ + "max_completion_tokens": 32, + }))) + .respond_with( + ResponseTemplate::new(200).set_body_json(serde_json::json!({"id": "chatcmpl-1"})), + ) + .mount(&mock_server) + .await; + + let client = reqwest::Client::builder().build().unwrap(); + let validated = verify_backend_endpoint(&client, &route).await.unwrap(); + + assert_eq!(validated.protocol, "openai_chat_completions"); + } + + /// Legacy/self-hosted backends that reject `max_completion_tokens` + /// should succeed on the fallback probe using `max_tokens`. + #[tokio::test] + async fn verify_openai_chat_falls_back_to_max_tokens() { + let mock_server = MockServer::start().await; + let route = test_route( + &mock_server.uri(), + &["openai_chat_completions"], + AuthHeader::Bearer, + ); + + // Reject the primary probe (max_completion_tokens) with 400. + Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .and(body_partial_json(serde_json::json!({ + "max_completion_tokens": 32, + }))) + .respond_with(ResponseTemplate::new(400).set_body_string( + r#"{"error":{"message":"Unsupported parameter: 'max_completion_tokens'"}}"#, + )) + .expect(1) + .mount(&mock_server) + .await; + + // Accept the fallback probe (max_tokens). 
+ Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .and(body_partial_json(serde_json::json!({ + "max_tokens": 32, + }))) + .respond_with( + ResponseTemplate::new(200).set_body_json(serde_json::json!({"id": "chatcmpl-2"})), + ) + .expect(1) + .mount(&mock_server) + .await; + + let client = reqwest::Client::builder().build().unwrap(); + let validated = verify_backend_endpoint(&client, &route).await.unwrap(); + + assert_eq!(validated.protocol, "openai_chat_completions"); + } + + /// Non-chat-completions probes (e.g. anthropic_messages) should not + /// have a fallback — a 400 remains a hard failure. + #[tokio::test] + async fn verify_non_chat_completions_no_fallback() { + let mock_server = MockServer::start().await; + let route = test_route( + &mock_server.uri(), + &["anthropic_messages"], + AuthHeader::Custom("x-api-key"), + ); + + Mock::given(method("POST")) + .and(path("/v1/messages")) + .respond_with(ResponseTemplate::new(400).set_body_string("bad request")) + .mount(&mock_server) + .await; + + let client = reqwest::Client::builder().build().unwrap(); + let result = verify_backend_endpoint(&client, &route).await; + + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().kind, + ValidationFailureKind::RequestShape + ); + } } diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 78b95944b..bbabaf70b 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -854,7 +854,7 @@ mod tests { .and(header("content-type", "application/json")) .and(body_partial_json(serde_json::json!({ "model": "gpt-4o-mini", - "max_tokens": 32, + "max_completion_tokens": 32, }))) .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ "id": "chatcmpl-123", From 6828e14642854f64b466addcec322e5d9e2d04c4 Mon Sep 17 00:00:00 2001 From: "John T. 
Myers" <9696606+johntmyers@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:41:57 -0700 Subject: [PATCH 04/45] fix(sandbox): emit warning when Landlock filesystem sandbox degrades silently (#599) * fix(sandbox): emit warning when Landlock filesystem sandbox degrades silently BestEffort Landlock previously swallowed failures at debug level, making sandbox bypass invisible to operators at default log levels. Upgrade the degradation log to warn with an actionable message pointing to the hard_requirement setting. Add info-level startup log showing the requested ABI and path counts so operators always know what Landlock protections are active. Closes #584 * fix(sandbox): revert unintended ABI bump from V2 to V5 Signed-off-by: John Myers --------- Signed-off-by: John Myers Co-authored-by: John Myers --- .../src/sandbox/linux/landlock.rs | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs index 2b9873b50..e276840dd 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs @@ -10,7 +10,7 @@ use landlock::{ }; use miette::{IntoDiagnostic, Result}; use std::path::PathBuf; -use tracing::debug; +use tracing::{debug, info, warn}; pub fn apply(policy: &SandboxPolicy, workdir: Option<&str>) -> Result<()> { let read_only = policy.filesystem.read_only.clone(); @@ -29,8 +29,16 @@ pub fn apply(policy: &SandboxPolicy, workdir: Option<&str>) -> Result<()> { return Ok(()); } + let abi = ABI::V2; + info!( + abi = ?abi, + compatibility = ?policy.landlock.compatibility, + read_only_paths = read_only.len(), + read_write_paths = read_write.len(), + "Applying Landlock filesystem sandbox" + ); + let result: Result<()> = (|| { - let abi = ABI::V2; let access_all = AccessFs::from_all(abi); let access_read = AccessFs::from_read(abi); @@ -71,7 +79,11 @@ pub fn apply(policy: 
&SandboxPolicy, workdir: Option<&str>) -> Result<()> { policy.landlock.compatibility, LandlockCompatibility::BestEffort ) { - debug!(error = %err, "Landlock unavailable, continuing without filesystem sandbox"); + warn!( + error = %err, + "Landlock filesystem sandbox is UNAVAILABLE — running WITHOUT filesystem restrictions. \ + Set landlock.compatibility to 'hard_requirement' to make this a fatal error." + ); return Ok(()); } return Err(err); From a7ebf3a6bfe12354aef4c2d760244976e1b8db3a Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 26 Mar 2026 14:58:46 +0100 Subject: [PATCH 05/45] fix(cluster): add Jetson Linux 5.15-tegra platform compatibility (#568) Three issues prevent k3s from starting on kernels where the nf_tables xt extension bridge (nft_compat) is unavailable: 1. kube-router's network policy controller uses the xt_comment iptables extension and panics on startup with "Extension comment revision 0 not supported, missing kernel module?" Pass --disable-network-policy to k3s so the controller never runs. The NSSH1 HMAC handshake remains the primary sandbox SSH isolation boundary, so this does not weaken the effective security posture. 2. flannel and kube-proxy also fail to insert rules via the nf_tables iptables backend on the same kernels. Add an xt_comment probe at cluster-entrypoint startup; if the probe fails, switch to iptables-legacy via update-alternatives before any other netfilter work so that flannel, kube-proxy, and the DNS proxy all use a consistent backend. 3. The br_netfilter kernel module must be loaded on the host for iptables rules to apply to pod bridge traffic. Without it, ClusterIP DNAT (including kube-dns at 10.43.0.10) is never applied to pod packets, causing silent DNS timeouts deep in the health-check loop. Add an early check that fails fast with an actionable error message if the module is not present, instructing the user to run `sudo modprobe br_netfilter` on the host. 
Signed-off-by: Evan Lezar --- .../skills/debug-openshell-cluster/SKILL.md | 2 + .../src/sandbox/linux/netns.rs | 99 +++++++++++++++++-- deploy/docker/cluster-entrypoint.sh | 70 ++++++++++++- 3 files changed, 159 insertions(+), 12 deletions(-) diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 4d0e46597..5af8895cf 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -104,6 +104,7 @@ Look for: - k3s startup errors (certificate issues, port binding failures) - Manifest copy errors from `/opt/openshell/manifests/` - `iptables` or `cgroup` errors (privilege/capability issues) +- `Warning: br_netfilter does not appear to be loaded` — this is advisory only; many kernels work without the explicit module. Only act on it if you also see DNS failures or pod-to-service connectivity problems (see Common Failure Patterns). ### Step 2: Check k3s Cluster Health @@ -308,6 +309,7 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w | Port conflict | Another service on the configured gateway host port (default 8080) | Stop conflicting service or use `--port` on `openshell gateway start` to pick a different host port | | gRPC connect refused to `127.0.0.1:443` in CI | Docker daemon is remote (`DOCKER_HOST=tcp://...`) but metadata still points to loopback | Verify metadata endpoint host matches `DOCKER_HOST` and includes non-loopback host | | DNS failures inside container | Entrypoint DNS detection failed | `openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf` and `openshell doctor logs --lines 20` | +| Pods can't reach kube-dns / ClusterIP services | `br_netfilter` not loaded; bridge traffic bypasses iptables DNAT rules | `sudo modprobe br_netfilter` on the host, then `echo br_netfilter \| sudo tee /etc/modules-load.d/br_netfilter.conf` to persist. Known to be required on Jetson Linux 5.15-tegra; other kernels (e.g. 
standard x86/aarch64 Linux) may have bridge netfilter built in and work without the module. The entrypoint logs a warning when `/proc/sys/net/bridge/bridge-nf-call-iptables` is absent but does not abort — only act on it if DNS or service connectivity is actually broken. | | Node DiskPressure / MemoryPressure / PIDPressure | Insufficient disk, memory, or PIDs on host | Free disk (`docker system prune -a --volumes`), increase memory, or expand host resources. Bootstrap auto-detects via `HEALTHCHECK_NODE_PRESSURE` marker | | Pods evicted with "The node had condition: [DiskPressure]" | Host disk full, kubelet evicting pods | Free disk space on host, then `openshell gateway destroy && openshell gateway start` | | `metrics-server` errors in logs | Normal k3s noise, not the root cause | These errors are benign — look for the actual failing health check component | diff --git a/crates/openshell-sandbox/src/sandbox/linux/netns.rs b/crates/openshell-sandbox/src/sandbox/linux/netns.rs index 5e6907c53..095ed86c4 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/netns.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/netns.rs @@ -262,15 +262,18 @@ impl NetworkNamespace { info!( namespace = %self.name, - iptables = iptables_path, + iptables = %iptables_path, proxy_addr = %format!("{}:{}", host_ip_str, proxy_port), "Installing bypass detection rules" ); // Install IPv4 rules - if let Err(e) = - self.install_bypass_rules_for(iptables_path, &host_ip_str, &proxy_port_str, &log_prefix) - { + if let Err(e) = self.install_bypass_rules_for( + &iptables_path, + &host_ip_str, + &proxy_port_str, + &log_prefix, + ) { warn!( namespace = %self.name, error = %e, @@ -281,7 +284,7 @@ impl NetworkNamespace { // Install IPv6 rules — best-effort. // Skip the proxy ACCEPT rule for IPv6 since the proxy address is IPv4. 
- if let Some(ip6_path) = find_ip6tables(iptables_path) { + if let Some(ip6_path) = find_ip6tables(&iptables_path) { if let Err(e) = self.install_bypass_rules_for_v6(&ip6_path, &log_prefix) { warn!( namespace = %self.name, @@ -666,12 +669,92 @@ fn run_iptables_netns(netns: &str, iptables_cmd: &str, args: &[&str]) -> Result< const IPTABLES_SEARCH_PATHS: &[&str] = &["/usr/sbin/iptables", "/sbin/iptables", "/usr/bin/iptables"]; +/// Returns true if xt extension modules (e.g. xt_comment) cannot be used +/// via the given iptables binary. +/// +/// Some kernels have nf_tables but lack the nft_compat bridge that allows +/// xt extension modules to be used through the nf_tables path (e.g. Jetson +/// Linux 5.15-tegra). This probe detects that condition by attempting to +/// insert a rule using the xt_comment extension. If it fails, xt extensions +/// are unavailable and the caller should fall back to iptables-legacy. +fn xt_extensions_unavailable(iptables_path: &str) -> bool { + // Create a temporary probe chain. If this fails (e.g. no CAP_NET_ADMIN), + // we can't determine availability — assume extensions are available. + let created = Command::new(iptables_path) + .args(["-t", "filter", "-N", "_xt_probe"]) + .output() + .map(|o| o.status.success()) + .unwrap_or(false); + + if !created { + return false; + } + + // Attempt to insert a rule using xt_comment. Failure means nft_compat + // cannot bridge xt extension modules on this kernel. + let probe_ok = Command::new(iptables_path) + .args([ + "-t", + "filter", + "-A", + "_xt_probe", + "-m", + "comment", + "--comment", + "probe", + "-j", + "ACCEPT", + ]) + .output() + .map(|o| o.status.success()) + .unwrap_or(false); + + // Clean up — best-effort, ignore failures. 
+ let _ = Command::new(iptables_path) + .args([ + "-t", + "filter", + "-D", + "_xt_probe", + "-m", + "comment", + "--comment", + "probe", + "-j", + "ACCEPT", + ]) + .output(); + let _ = Command::new(iptables_path) + .args(["-t", "filter", "-X", "_xt_probe"]) + .output(); + + !probe_ok +} + /// Find the iptables binary path, checking well-known locations. -fn find_iptables() -> Option<&'static str> { - IPTABLES_SEARCH_PATHS +/// +/// If xt extension modules are unavailable via the standard binary and +/// `iptables-legacy` is available alongside it, the legacy binary is returned +/// instead. This ensures bypass-detection rules can be installed on kernels +/// where `nft_compat` is unavailable (e.g. Jetson Linux 5.15-tegra). +fn find_iptables() -> Option { + let standard_path = IPTABLES_SEARCH_PATHS .iter() .find(|path| std::path::Path::new(path).exists()) - .copied() + .copied()?; + + if xt_extensions_unavailable(standard_path) { + let legacy_path = standard_path.replace("iptables", "iptables-legacy"); + if std::path::Path::new(&legacy_path).exists() { + debug!( + legacy = legacy_path, + "xt extensions unavailable; using iptables-legacy" + ); + return Some(legacy_path); + } + } + + Some(standard_path.to_string()) } /// Find the ip6tables binary path, deriving it from the iptables location. diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 84b8cf9ac..2fea6fa61 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -25,6 +25,61 @@ set -e +# --------------------------------------------------------------------------- +# Select iptables backend +# --------------------------------------------------------------------------- +# Some kernels (e.g. Jetson Linux 5.15-tegra) have the nf_tables subsystem +# but lack the nft_compat bridge that allows flannel and kube-proxy to use +# xt extension modules (xt_comment, xt_conntrack). 
Detect this by probing +# whether xt_comment is usable via the current iptables backend. If the +# probe fails, switch to iptables-legacy. Set USE_IPTABLES_LEGACY=1 +# externally to skip the probe and force the legacy backend. +# --------------------------------------------------------------------------- +# Check br_netfilter kernel module +# --------------------------------------------------------------------------- +# br_netfilter makes the kernel pass bridge (pod-to-pod) traffic through +# iptables. Without it, kube-proxy's DNAT rules for ClusterIP services are +# never applied to pod traffic, so pods cannot reach services such as +# kube-dns (10.43.0.10), breaking all in-cluster DNS resolution. +# +# The module must be loaded on the HOST before the container starts — +# containers cannot load kernel modules themselves. If it is missing, log a +# warning rather than failing hard: some kernels have bridge netfilter support +# built-in or expose it differently, and will work correctly without the module +# being explicitly loaded as a separate .ko. +if [ ! -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + echo "Warning: br_netfilter does not appear to be loaded on the host." >&2 + echo " Pod-to-service networking (including kube-dns) may not work without it." >&2 + echo " If the cluster fails to start or DNS is broken, try loading it on the host:" >&2 + echo " sudo modprobe br_netfilter" >&2 + echo " To persist across reboots:" >&2 + echo " echo br_netfilter | sudo tee /etc/modules-load.d/br_netfilter.conf" >&2 +fi + +if [ -z "${USE_IPTABLES_LEGACY:-}" ]; then + if iptables -t filter -N _xt_probe 2>/dev/null; then + _probe_rc=0 + iptables -t filter -A _xt_probe -m comment --comment "probe" -j ACCEPT \ + 2>/dev/null || _probe_rc=$? 
+ iptables -t filter -D _xt_probe -m comment --comment "probe" -j ACCEPT \ + 2>/dev/null || true + iptables -t filter -X _xt_probe 2>/dev/null || true + [ "$_probe_rc" -ne 0 ] && USE_IPTABLES_LEGACY=1 + fi +fi + +if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then + echo "iptables nf_tables xt extension bridge unavailable — switching to iptables-legacy" + if update-alternatives --set iptables /usr/sbin/iptables-legacy 2>/dev/null && \ + update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 2>/dev/null; then + echo "Now using iptables-legacy mode" + else + echo "Warning: could not switch to iptables-legacy — cluster networking may fail" + fi +fi + +IPTABLES=$([ "${USE_IPTABLES_LEGACY:-0}" = "1" ] && echo iptables-legacy || echo iptables) + RESOLV_CONF="/etc/rancher/k3s/resolv.conf" has_default_route() { @@ -74,11 +129,11 @@ setup_dns_proxy() { # Docker sets up rules like: # -A DOCKER_OUTPUT -d 127.0.0.11/32 -p udp --dport 53 -j DNAT --to-destination 127.0.0.11: # -A DOCKER_OUTPUT -d 127.0.0.11/32 -p tcp --dport 53 -j DNAT --to-destination 127.0.0.11: - UDP_PORT=$(iptables -t nat -S DOCKER_OUTPUT 2>/dev/null \ + UDP_PORT=$($IPTABLES -t nat -S DOCKER_OUTPUT 2>/dev/null \ | grep -- '-p udp.*--dport 53' \ | sed -n 's/.*--to-destination 127.0.0.11:\([0-9]*\).*/\1/p' \ | head -1) - TCP_PORT=$(iptables -t nat -S DOCKER_OUTPUT 2>/dev/null \ + TCP_PORT=$($IPTABLES -t nat -S DOCKER_OUTPUT 2>/dev/null \ | grep -- '-p tcp.*--dport 53' \ | sed -n 's/.*--to-destination 127.0.0.11:\([0-9]*\).*/\1/p' \ | head -1) @@ -101,9 +156,9 @@ setup_dns_proxy() { echo "Setting up DNS proxy: ${CONTAINER_IP}:53 -> 127.0.0.11 (udp:${UDP_PORT}, tcp:${TCP_PORT})" # Forward DNS from pods (PREROUTING) and local processes (OUTPUT) to Docker's DNS - iptables -t nat -I PREROUTING -p udp --dport 53 -d "$CONTAINER_IP" -j DNAT \ + $IPTABLES -t nat -I PREROUTING -p udp --dport 53 -d "$CONTAINER_IP" -j DNAT \ --to-destination "127.0.0.11:${UDP_PORT}" - iptables -t nat -I PREROUTING -p tcp --dport 53 -d 
"$CONTAINER_IP" -j DNAT \ + $IPTABLES -t nat -I PREROUTING -p tcp --dport 53 -d "$CONTAINER_IP" -j DNAT \ --to-destination "127.0.0.11:${TCP_PORT}" echo "nameserver $CONTAINER_IP" > "$RESOLV_CONF" @@ -495,6 +550,13 @@ if [ ! -f /sys/fs/cgroup/cgroup.controllers ]; then EXTRA_KUBELET_ARGS="--kubelet-arg=fail-cgroupv1=false" fi +# On kernels where xt_comment is unavailable, kube-router's network policy +# controller panics at startup. Disable it when the iptables-legacy probe +# triggered; sandbox isolation is enforced by the NSSH1 HMAC handshake instead. +if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then + EXTRA_KUBELET_ARGS="$EXTRA_KUBELET_ARGS --disable-network-policy" +fi + # Docker Desktop can briefly start the container before its bridge default route # is fully installed. k3s exits immediately in that state, so wait briefly for # routing to settle first. From 0ac1fbd21b50f3aeaf7160223bf6ea5f6061a144 Mon Sep 17 00:00:00 2001 From: LateNightHackathon <256481314+latenighthackathon@users.noreply.github.com> Date: Sun, 29 Mar 2026 18:24:08 -0500 Subject: [PATCH 06/45] fix(l7): reject duplicate Content-Length headers to prevent request smuggling (CWE-444) (#663) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(l7): reject duplicate Content-Length headers to prevent request smuggling Both parse_body_length() in rest.rs and try_parse_http_request() in inference.rs silently accepted multiple Content-Length headers, overwriting with the last value seen. Per RFC 7230 Section 3.3.3, a message with multiple Content-Length headers with differing values must be rejected to prevent HTTP request smuggling (CWE-444). An attacker could send conflicting Content-Length values causing the proxy and downstream server to disagree on message boundaries. 
Fix: - rest.rs: detect duplicate CL headers with differing values and return an error before forwarding - inference.rs: add ParseResult::Invalid variant; detect duplicate CL headers and return Invalid with a descriptive reason - proxy.rs: handle ParseResult::Invalid by sending HTTP 400 and denying the connection Closes #637 Signed-off-by: latenighthackathon * fix(l7): address review feedback on Content-Length smuggling defense - inference.rs: reject unparseable Content-Length values instead of silently defaulting to 0 via unwrap_or(0) - rest.rs: reject unparseable Content-Length values so a valid+invalid duplicate pair cannot bypass the differing-values check - rest.rs: fix Transfer-Encoding substring match (.contains("chunked") → split/trim exact match) to align with inference.rs and prevent false positives on values like "chunkedx" - proxy.rs: log parsing details server-side via tracing::warn and return generic "Bad Request" body instead of leaking internal parsing reasons to sandboxed code - Add tests for all new rejection paths in inference.rs and rest.rs Signed-off-by: latenighthackathon * style(l7): apply cargo fmt formatting Signed-off-by: latenighthackathon --------- Signed-off-by: latenighthackathon Co-authored-by: latenighthackathon --- crates/openshell-sandbox/src/l7/inference.rs | 58 +++++++++++++++- crates/openshell-sandbox/src/l7/rest.rs | 71 ++++++++++++++++++-- crates/openshell-sandbox/src/proxy.rs | 6 ++ 3 files changed, 129 insertions(+), 6 deletions(-) diff --git a/crates/openshell-sandbox/src/l7/inference.rs b/crates/openshell-sandbox/src/l7/inference.rs index 59dafdaba..140213f07 100644 --- a/crates/openshell-sandbox/src/l7/inference.rs +++ b/crates/openshell-sandbox/src/l7/inference.rs @@ -96,6 +96,8 @@ pub enum ParseResult { Complete(ParsedHttpRequest, usize), /// Headers are incomplete — caller should read more data. Incomplete, + /// The request is malformed and must be rejected (e.g., duplicate Content-Length). 
+ Invalid(String), } /// Try to parse an HTTP/1.1 request from raw bytes. @@ -125,6 +127,7 @@ pub fn try_parse_http_request(buf: &[u8]) -> ParseResult { let mut headers = Vec::new(); let mut content_length: usize = 0; + let mut has_content_length = false; let mut is_chunked = false; for line in lines { if line.is_empty() { @@ -134,7 +137,21 @@ pub fn try_parse_http_request(buf: &[u8]) -> ParseResult { let name = name.trim().to_string(); let value = value.trim().to_string(); if name.eq_ignore_ascii_case("content-length") { - content_length = value.parse().unwrap_or(0); + let new_len: usize = match value.parse() { + Ok(v) => v, + Err(_) => { + return ParseResult::Invalid(format!( + "invalid Content-Length value: {value}" + )); + } + }; + if has_content_length && new_len != content_length { + return ParseResult::Invalid(format!( + "duplicate Content-Length headers with differing values ({content_length} vs {new_len})" + )); + } + content_length = new_len; + has_content_length = true; } if name.eq_ignore_ascii_case("transfer-encoding") && value @@ -552,4 +569,43 @@ mod tests { }; assert_eq!(parsed.body.len(), 100); } + + // ---- SEC: Content-Length validation ---- + + #[test] + fn reject_differing_duplicate_content_length() { + let request = b"POST /v1/chat/completions HTTP/1.1\r\nHost: x\r\nContent-Length: 0\r\nContent-Length: 50\r\n\r\n"; + assert!(matches!( + try_parse_http_request(request), + ParseResult::Invalid(reason) if reason.contains("differing values") + )); + } + + #[test] + fn accept_identical_duplicate_content_length() { + let request = b"POST /v1/chat/completions HTTP/1.1\r\nHost: x\r\nContent-Length: 5\r\nContent-Length: 5\r\n\r\nhello"; + let ParseResult::Complete(parsed, _) = try_parse_http_request(request) else { + panic!("expected Complete for identical duplicate CL"); + }; + assert_eq!(parsed.body, b"hello"); + } + + #[test] + fn reject_non_numeric_content_length() { + let request = + b"POST /v1/chat/completions HTTP/1.1\r\nHost: 
x\r\nContent-Length: abc\r\n\r\n"; + assert!(matches!( + try_parse_http_request(request), + ParseResult::Invalid(reason) if reason.contains("invalid Content-Length") + )); + } + + #[test] + fn reject_two_non_numeric_content_lengths() { + let request = b"POST /v1/chat/completions HTTP/1.1\r\nHost: x\r\nContent-Length: abc\r\nContent-Length: def\r\n\r\n"; + assert!(matches!( + try_parse_http_request(request), + ParseResult::Invalid(_) + )); + } } diff --git a/crates/openshell-sandbox/src/l7/rest.rs b/crates/openshell-sandbox/src/l7/rest.rs index ebb349578..f47f01bdc 100644 --- a/crates/openshell-sandbox/src/l7/rest.rs +++ b/crates/openshell-sandbox/src/l7/rest.rs @@ -242,14 +242,22 @@ fn parse_body_length(headers: &str) -> Result { let lower = line.to_ascii_lowercase(); if lower.starts_with("transfer-encoding:") { let val = lower.split_once(':').map_or("", |(_, v)| v.trim()); - if val.contains("chunked") { + if val.split(',').any(|enc| enc.trim() == "chunked") { has_te_chunked = true; } } - if lower.starts_with("content-length:") - && let Some(val) = lower.split_once(':').map(|(_, v)| v.trim()) - && let Ok(len) = val.parse::() - { + if lower.starts_with("content-length:") { + let val = lower.split_once(':').map_or("", |(_, v)| v.trim()); + let len: u64 = val + .parse() + .map_err(|_| miette!("Request contains invalid Content-Length value"))?; + if let Some(prev) = cl_value { + if prev != len { + return Err(miette!( + "Request contains multiple Content-Length headers with differing values ({prev} vs {len})" + )); + } + } cl_value = Some(len); } } @@ -702,6 +710,59 @@ mod tests { ); } + /// SEC: Reject differing duplicate Content-Length headers. 
+ #[test] + fn reject_differing_duplicate_content_length() { + let headers = + "POST /api HTTP/1.1\r\nHost: x\r\nContent-Length: 0\r\nContent-Length: 50\r\n\r\n"; + assert!( + parse_body_length(headers).is_err(), + "Must reject differing duplicate Content-Length" + ); + } + + /// SEC: Accept identical duplicate Content-Length headers. + #[test] + fn accept_identical_duplicate_content_length() { + let headers = + "POST /api HTTP/1.1\r\nHost: x\r\nContent-Length: 42\r\nContent-Length: 42\r\n\r\n"; + match parse_body_length(headers).unwrap() { + BodyLength::ContentLength(42) => {} + other => panic!("Expected ContentLength(42), got {other:?}"), + } + } + + /// SEC: Reject non-numeric Content-Length values. + #[test] + fn reject_non_numeric_content_length() { + let headers = "POST /api HTTP/1.1\r\nHost: x\r\nContent-Length: abc\r\n\r\n"; + assert!( + parse_body_length(headers).is_err(), + "Must reject non-numeric Content-Length" + ); + } + + /// SEC: Reject when second Content-Length is non-numeric (bypass test). + #[test] + fn reject_valid_then_invalid_content_length() { + let headers = + "POST /api HTTP/1.1\r\nHost: x\r\nContent-Length: 42\r\nContent-Length: abc\r\n\r\n"; + assert!( + parse_body_length(headers).is_err(), + "Must reject when any Content-Length is non-numeric" + ); + } + + /// SEC: Transfer-Encoding substring match must not match partial tokens. + #[test] + fn te_substring_not_chunked() { + let headers = "POST /api HTTP/1.1\r\nHost: x\r\nTransfer-Encoding: chunkedx\r\n\r\n"; + match parse_body_length(headers).unwrap() { + BodyLength::None => {} + other => panic!("Expected None for non-matching TE, got {other:?}"), + } + } + /// SEC-009: Bare LF in headers enables header injection. 
#[tokio::test] async fn reject_bare_lf_in_headers() { diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 1f38a2cca..6f3eaf9bd 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -943,6 +943,12 @@ async fn handle_inference_interception( buf.resize((buf.len() * 2).min(MAX_INFERENCE_BUF), 0); } } + ParseResult::Invalid(reason) => { + warn!(reason = %reason, "rejecting malformed inference request"); + let response = format_http_response(400, &[], b"Bad Request"); + write_all(&mut tls_client, &response).await?; + return Ok(InferenceOutcome::Denied { reason }); + } } } From 94fbb643b3da25e92afee1bd34c529969b5009b1 Mon Sep 17 00:00:00 2001 From: LateNightHackathon <256481314+latenighthackathon@users.noreply.github.com> Date: Mon, 30 Mar 2026 11:24:38 -0500 Subject: [PATCH 07/45] fix(proxy): add L7 inspection to forward proxy path (#666) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor(l7): export evaluate_l7_request for cross-module use Make evaluate_l7_request() public so the forward proxy path can evaluate individual requests against L7 policy without going through the full relay_with_inspection() loop. Signed-off-by: latenighthackathon * fix(proxy): add L7 inspection to forward proxy path The forward proxy previously rejected all requests to endpoints with L7 rules (blanket 403), forcing clients through the CONNECT tunnel. This meant policies like read-only (allow GET, block POST) had no effect on plain http:// requests through the forward proxy. 
Replace the blanket rejection with actual L7 evaluation: - Query L7 config for the endpoint (same as before) - Clone the OPA engine and evaluate the request method/path - Allow if L7 policy permits, deny with 403 if enforcement is enforce - Audit mode: log but allow (matching CONNECT path behavior) - Fail-closed: deny on evaluation errors The forward proxy uses Connection: close (one request per connection), so a single evaluation suffices — no relay loop needed. Update e2e tests to validate the new behavior: - GET /allowed → 200 (L7 policy allows) - POST /allowed → 403 (L7 policy denies, enforcement: enforce) Update security-policy.md to reflect the new forward proxy L7 behavior. Closes #643 Signed-off-by: latenighthackathon * style(proxy): apply cargo fmt formatting Signed-off-by: latenighthackathon --------- Signed-off-by: latenighthackathon Co-authored-by: latenighthackathon --- architecture/security-policy.md | 2 +- crates/openshell-sandbox/src/l7/relay.rs | 2 +- crates/openshell-sandbox/src/proxy.rs | 95 ++++++++++++++++++----- e2e/rust/tests/forward_proxy_l7_bypass.rs | 76 ++++++++++++++---- 4 files changed, 141 insertions(+), 34 deletions(-) diff --git a/architecture/security-policy.md b/architecture/security-policy.md index 44898d70b..8c41e5b91 100644 --- a/architecture/security-policy.md +++ b/architecture/security-policy.md @@ -716,7 +716,7 @@ If any condition fails, the proxy returns `403 Forbidden`. 7. Rewrites the request: absolute-form → origin-form (`GET /path HTTP/1.1`), strips hop-by-hop headers, adds `Via: 1.1 openshell-sandbox` and `Connection: close` 8. Forwards the rewritten request, then relays bidirectionally using `tokio::io::copy_bidirectional` (supports chunked transfer, SSE streams, and other long-lived responses with no idle timeout) -**V1 simplifications**: Forward proxy v1 injects `Connection: close` (no keep-alive) and does not perform L7 inspection on the forwarded traffic. 
Every forward proxy connection handles exactly one request-response exchange. +**V1 simplifications**: Forward proxy v1 injects `Connection: close` (no keep-alive). Every forward proxy connection handles exactly one request-response exchange. When an endpoint has L7 rules configured, the forward proxy evaluates the single request's method and path against L7 policy before forwarding. **Implementation**: See `crates/openshell-sandbox/src/proxy.rs` -- `handle_forward_proxy()`, `parse_proxy_uri()`, `rewrite_forward_request()`. diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-sandbox/src/l7/relay.rs index 940e7f94c..88f9ce8a6 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-sandbox/src/l7/relay.rs @@ -180,7 +180,7 @@ fn is_benign_connection_error(err: &miette::Report) -> bool { /// Evaluate an L7 request against the OPA engine. /// /// Returns `(allowed, deny_reason)`. -fn evaluate_l7_request( +pub fn evaluate_l7_request( engine: &Mutex, ctx: &L7EvalContext, request: &L7RequestInfo, diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 6f3eaf9bd..088ec46a6 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -1803,10 +1803,62 @@ async fn handle_forward_proxy( }; let policy_str = matched_policy.as_deref().unwrap_or("-"); - // 4b. Reject if the endpoint has L7 config — the forward proxy path does - // not perform per-request method/path inspection, so L7-configured - // endpoints must go through the CONNECT tunnel where inspection happens. - if query_l7_config(&opa_engine, &decision, &host_lc, port).is_some() { + // 4b. If the endpoint has L7 config, evaluate the request against + // L7 policy. The forward proxy handles exactly one request per + // connection (Connection: close), so a single evaluation suffices. 
+ if let Some(l7_config) = query_l7_config(&opa_engine, &decision, &host_lc, port) { + let tunnel_engine = opa_engine.clone_engine_for_tunnel().unwrap_or_else(|e| { + warn!( + error = %e, + "Failed to clone OPA engine for forward L7" + ); + regorus::Engine::new() + }); + let engine_mutex = std::sync::Mutex::new(tunnel_engine); + + let l7_ctx = crate::l7::relay::L7EvalContext { + host: host_lc.clone(), + port, + policy_name: matched_policy.clone().unwrap_or_default(), + binary_path: decision + .binary + .as_ref() + .map(|p| p.to_string_lossy().into_owned()) + .unwrap_or_default(), + ancestors: decision + .ancestors + .iter() + .map(|p| p.to_string_lossy().into_owned()) + .collect(), + cmdline_paths: decision + .cmdline_paths + .iter() + .map(|p| p.to_string_lossy().into_owned()) + .collect(), + secret_resolver: secret_resolver.clone(), + }; + + let request_info = crate::l7::L7RequestInfo { + action: method.to_string(), + target: path.clone(), + }; + + let (allowed, reason) = + crate::l7::relay::evaluate_l7_request(&engine_mutex, &l7_ctx, &request_info) + .unwrap_or_else(|e| { + warn!( + error = %e, + "L7 eval failed, denying request" + ); + (false, format!("L7 evaluation error: {e}")) + }); + + let decision_str = match (allowed, l7_config.enforcement) { + (true, _) => "allow", + (false, crate::l7::EnforcementMode::Audit) => "audit", + (false, crate::l7::EnforcementMode::Enforce) => "deny", + }; + info!( dst_host = %host_lc, dst_port = port, @@ -1814,21 +1866,28 @@ async fn handle_forward_proxy( path = %path, binary = %binary_str, policy = %policy_str, - action = "deny", - reason = "endpoint has L7 rules; use CONNECT", - "FORWARD", + l7_protocol = "rest", + l7_decision = decision_str, + l7_deny_reason = %reason, + "FORWARD_L7", ); - emit_denial_simple( - denial_tx, - &host_lc, - port, - &binary_str, - &decision, - "endpoint has L7 rules configured; forward proxy bypasses L7 inspection — use CONNECT", - "forward-l7-bypass", - ); - respond(client, b"HTTP/1.1 403 
Forbidden\r\n\r\n").await?; - return Ok(()); + + let effectively_denied = + !allowed && l7_config.enforcement == crate::l7::EnforcementMode::Enforce; + + if effectively_denied { + emit_denial_simple( + denial_tx, + &host_lc, + port, + &binary_str, + &decision, + &reason, + "forward-l7-deny", + ); + respond(client, b"HTTP/1.1 403 Forbidden\r\n\r\n").await?; + return Ok(()); + } } // 5. DNS resolution + SSRF defence (mirrors the CONNECT path logic). diff --git a/e2e/rust/tests/forward_proxy_l7_bypass.rs b/e2e/rust/tests/forward_proxy_l7_bypass.rs index fb75176cc..89ecf2cd4 100644 --- a/e2e/rust/tests/forward_proxy_l7_bypass.rs +++ b/e2e/rust/tests/forward_proxy_l7_bypass.rs @@ -1,10 +1,10 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Regression test: the forward proxy path must reject requests to endpoints -//! that have L7 rules configured. Before the fix, plain `http://` requests -//! (which use the forward proxy, not CONNECT) bypassed per-request method/path -//! enforcement entirely. +//! Regression tests: the forward proxy path must evaluate L7 rules for +//! endpoints that have them configured. Allowed requests (e.g. GET on a +//! read-only endpoint) should succeed; denied requests (e.g. POST) should +//! receive 403. #![cfg(feature = "e2e")] @@ -145,6 +145,7 @@ network_policies: - host: host.openshell.internal port: {port} protocol: rest + enforcement: enforce allowed_ips: - "172.0.0.0/8" rules: @@ -164,24 +165,21 @@ network_policies: Ok(file) } -/// The forward proxy path (plain http:// via HTTP_PROXY) must return 403 for -/// endpoints with L7 rules, forcing clients through the CONNECT tunnel where -/// per-request method/path inspection actually happens. +/// GET /allowed should succeed — the L7 policy explicitly allows it. 
#[tokio::test] -async fn forward_proxy_rejects_l7_configured_endpoint() { +async fn forward_proxy_allows_l7_permitted_request() { let server = DockerServer::start() .await .expect("start docker test server"); - let policy = write_policy_with_l7_rules(server.port).expect("write custom policy"); + let policy = + write_policy_with_l7_rules(server.port) + .expect("write custom policy"); let policy_path = policy .path() .to_str() .expect("temp policy path should be utf-8") .to_string(); - // Python script that tries a plain http:// request (forward proxy path). - // HTTP_PROXY is set automatically by the sandbox, so urllib will use the - // forward proxy for http:// URLs (not CONNECT). let script = format!( r#" import urllib.request, urllib.error, json, sys @@ -208,10 +206,60 @@ except Exception as e: .await .expect("sandbox create"); - // The forward proxy should return 403 because the endpoint has L7 rules. + // L7 policy allows GET /allowed — should succeed. + assert!( + guard.create_output.contains("\"status\": 200"), + "expected 200 for L7-allowed GET, got:\n{}", + guard.create_output + ); +} + +/// POST /allowed should be denied — the L7 policy only allows GET. 
+#[tokio::test] +async fn forward_proxy_denies_l7_blocked_request() { + let server = DockerServer::start() + .await + .expect("start docker test server"); + let policy = + write_policy_with_l7_rules(server.port) + .expect("write custom policy"); + let policy_path = policy + .path() + .to_str() + .expect("temp policy path should be utf-8") + .to_string(); + + let script = format!( + r#" +import urllib.request, urllib.error, json, sys +url = "http://host.openshell.internal:{port}/allowed" +req = urllib.request.Request(url, data=b"test", method="POST") +try: + resp = urllib.request.urlopen(req, timeout=15) + print(json.dumps({{"status": resp.status, "error": None}})) +except urllib.error.HTTPError as e: + print(json.dumps({{"status": e.code, "error": str(e)}})) +except Exception as e: + print(json.dumps({{"status": -1, "error": str(e)}})) +"#, + port = server.port, + ); + + let guard = SandboxGuard::create(&[ + "--policy", + &policy_path, + "--", + "python3", + "-c", + &script, + ]) + .await + .expect("sandbox create"); + + // L7 policy denies POST — should return 403. assert!( guard.create_output.contains("\"status\": 403"), - "expected 403 from forward proxy for L7-configured endpoint, got:\n{}", + "expected 403 for L7-denied POST, got:\n{}", guard.create_output ); } From a69ef060349bb5c59ee0f40a8b95d17913e05641 Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Mon, 30 Mar 2026 09:43:42 -0700 Subject: [PATCH 08/45] fix(ci): skip docs preview deploy for fork PRs (#679) Fork PRs receive a read-only GITHUB_TOKEN that cannot push to gh-pages, causing the deploy step to fail with a 403. Skip the deploy for fork PRs while still running the docs build to validate changes compile cleanly. 
--- .github/workflows/docs-preview-pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/docs-preview-pr.yml b/.github/workflows/docs-preview-pr.yml index ccade7411..6c0672ba2 100644 --- a/.github/workflows/docs-preview-pr.yml +++ b/.github/workflows/docs-preview-pr.yml @@ -49,6 +49,7 @@ jobs: find _build -name .buildinfo -exec rm {} \; - name: Deploy preview + if: github.event.pull_request.head.repo.full_name == github.repository uses: rossjrw/pr-preview-action@v1 with: source-dir: ./_build/docs/ From c1dd81e5d4f8891a945810b6313b50ac34e53046 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Mon, 30 Mar 2026 10:00:04 -0700 Subject: [PATCH 09/45] docs(rfc): add RFC process with draft/review/accepted lifecycle (#678) --- CONTRIBUTING.md | 5 ++ rfc/0000-template.md | 49 +++++++++++++++ rfc/README.md | 140 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 194 insertions(+) create mode 100644 rfc/0000-template.md create mode 100644 rfc/README.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c25d30b92..1ebf71df2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -186,9 +186,14 @@ These are the primary `mise` tasks for day-to-day development: | `tasks/` | `mise` task definitions and build scripts | | `deploy/` | Dockerfiles, Helm chart, Kubernetes manifests | | `architecture/` | Architecture docs and plans | +| `rfc/` | Request for Comments proposals | | `docs/` | User-facing documentation (Sphinx/MyST) | | `.agents/` | Agent skills and persona definitions | +## RFCs + +For cross-cutting architectural decisions, API contract changes, or process proposals that need broad consensus, use the RFC process. RFCs live in `rfc/` — copy the template, fill it in, and open a PR for discussion. See [rfc/README.md](rfc/README.md) for the full lifecycle and guidelines on when to write an RFC versus a spike issue or architecture doc. 
+ ## Documentation If your change affects user-facing behavior (new flags, changed defaults, new features, bug fixes that contradict existing docs), update the relevant pages under `docs/` in the same PR. diff --git a/rfc/0000-template.md b/rfc/0000-template.md new file mode 100644 index 000000000..ec66ca967 --- /dev/null +++ b/rfc/0000-template.md @@ -0,0 +1,49 @@ +--- +authors: + - "@your-github-username" +state: draft +links: + - (related PRs, discussions, or issues) +--- + +# RFC NNNN - Your Title Here + + + +## Summary + +One paragraph explanation of the feature or change. + +## Motivation + +Why are we doing this? What problem does it solve? What use cases does it support? + +## Non-goals + +What is explicitly out of scope for this RFC? + +## Proposal + +What are we actually proposing? Explain the design in enough detail that someone familiar with OpenShell can understand and implement it. + +## Implementation plan + +How do we get from here to there? Consider rollout strategy, backwards compatibility, and migration. + +## Risks + +Why should we *not* do this? What are the costs? + +## Alternatives + +What other designs have been considered? What is the impact of not doing this? + +## Prior art + +Does this feature exist in other projects? What can we learn from them? + +## Open questions + +What parts of the design are still TBD? diff --git a/rfc/README.md b/rfc/README.md new file mode 100644 index 000000000..b409cad00 --- /dev/null +++ b/rfc/README.md @@ -0,0 +1,140 @@ +# OpenShell RFCs + +Substantial changes to OpenShell should be proposed in writing before implementation begins. An RFC provides a consistent way to propose an idea, collect feedback from the community, build consensus, and document the decision for future contributors. Not every change needs an RFC — bug fixes, small features, and routine maintenance go through normal pull requests. 
RFCs are for the changes that are cross-cutting, potentially controversial, or significant enough that stakeholders should weigh in before code is written. + +## Start with a GitHub Discussion + +Before writing an RFC, consider opening a [GitHub Discussion](https://github.com/NVIDIA/OpenShell/discussions) to gauge interest and get early feedback. This helps: + +- Validate that the problem is worth solving +- Surface potential concerns early +- Build consensus before investing in a detailed proposal +- Identify the right reviewers and stakeholders + +If the discussion shows sufficient interest and the idea has merit, then it's time to write an RFC to detail the plan and technical approach. + +## RFCs vs other artifacts + +OpenShell has several places where design information lives. Use this guide to pick the right one: + +| Artifact | Purpose | When to use | +|----------|---------|-------------| +| **GitHub Discussion** | Gauge interest in a rough idea | You have a thought but aren't sure it's worth a proposal yet | +| **Spike issue** (`create-spike`) | Investigate implementation feasibility for a scoped change | You need to explore the codebase and produce a buildable issue for a specific component or feature | +| **RFC** | Propose a cross-cutting decision that needs broad consensus | Architectural changes, API contracts, process changes, or anything that spans multiple components or teams | +| **Architecture doc** (`architecture/`) | Document how things work today | Living reference material — updated as the system evolves | + +The key distinction: **spikes investigate whether and how something can be done; RFCs propose that we should do it and seek agreement on the approach.** A spike may precede an RFC (to gather data) or follow one (to flesh out implementation details). When an RFC reaches `implemented`, its relevant content should be folded into the appropriate `architecture/` docs so the living reference stays current. 
+ +## When to use an RFC + +The following are examples of when an RFC is appropriate: + +- An architectural or design decision for the platform +- Change to an API or command-line tool +- Change to an internal API or tool +- Add or change a company or team process +- A design for testing + +RFCs don't only apply to technical ideas but overall project ideas and processes as well. If you have an idea to improve the way something is being done, you have the power to make your voice heard. + +## When NOT to use an RFC + +Not everything needs an RFC. Skip the RFC process for: + +- Bug fixes +- Small feature additions scoped to a single component (use a spike instead) +- Documentation changes +- Dependency updates +- Refactoring that doesn't change public interfaces + +If a change doesn't require cross-team consensus, a spike issue is the right vehicle. + +## RFC metadata and state + +At the start of every RFC document, we include a brief amount of metadata in YAML front matter: + +```yaml +--- +authors: + - "@username" +state: draft +links: + - https://github.com/NVIDIA/OpenShell/pull/123 + - https://github.com/NVIDIA/OpenShell/discussions/456 +--- +``` + +We track the following metadata: + +- **authors**: The authors (and therefore owners) of an RFC. Listed as GitHub usernames. +- **state**: Must be one of the states discussed below. +- **links**: Related PRs, discussions, or issues. Add entries as the RFC progresses. +- **superseded_by**: *(optional)* For RFCs in the `superseded` state, the RFC number that replaces this one (e.g., `0005`). + +An RFC can be in one of the following states: + +| State | Description | +|-------|-------------| +| `draft` | The RFC is being written and is not yet ready for formal review. | +| `review` | Under active discussion in a pull request. | +| `accepted` | The proposal has been accepted and is ready for implementation. | +| `rejected` | The proposal was reviewed and declined. 
| +| `implemented` | The idea has been entirely implemented. Changes would be infrequent. | +| `superseded` | Replaced by a newer RFC. The `superseded_by` field should reference the replacement. | + +## RFC lifecycle + +### 1. Reserve an RFC number + +Look at the existing RFCs in this directory and choose the next available number. If two authors happen to pick the same number on separate branches, the conflict is resolved during PR review — the later PR simply picks the next available number. + +### 2. Create your RFC + +Copy `0000-template.md` to `NNNN-my-feature.md` where `NNNN` is your RFC number (zero-padded to 4 digits) and `my-feature` is a short descriptive name. + +Fill in the metadata and start writing. The state should be `draft` while you're iterating. + +### 3. Open a pull request + +When you're ready for feedback, update the state to `review` and open a pull request. Add the PR link to the `links` field in the metadata. + +The PR is where discussion happens. Anyone subscribed to the repo will get a notification and can read your RFC and provide feedback. + +### 4. Iterate and build consensus + +The comments you choose to accept are up to you as the owner of the RFC, but you should remain empathetic in how you engage. For those giving feedback, be sure that all feedback is constructive. + +RFCs rarely go through this process unchanged. Make edits as new commits to the PR and leave comments explaining your changes. + +### 5. Merge the pull request + +After there has been time for folks to comment, the RFC author requests merge and a maintainer approves and merges. The state should be updated from `review` to `accepted`. If the proposal is declined, the state should be set to `rejected`. The timing is left to the author's discretion. As a guideline, a few business days seems reasonable, but circumstances may dictate a different timeline. + +In general, RFCs shouldn't be merged if no one else has read or commented on it. 
If no one is reading your RFC, it's time to explicitly ask someone to give it a read! + +### 6. Implementation + +Once an RFC has been entirely implemented, its state should be moved to `implemented`. This represents ideas that have been fully developed. While discussion on implemented RFCs is permitted, changes would be expected to be infrequent. + +## Diagrams + +When an RFC needs diagrams to illustrate architecture, data flow, or component interactions, use [Mermaid](https://mermaid.js.org/) diagrams embedded directly in the Markdown. Mermaid renders natively on GitHub and keeps diagrams version-controlled alongside the text. + +````markdown +```mermaid +graph LR + A[Client] --> B[Gateway] + B --> C[Sandbox] +``` +```` + +Prefer Mermaid over external image files whenever possible. If a diagram is too complex for Mermaid (e.g., detailed UI mockups), commit the image to the same directory as the RFC and reference it with a relative path. + +## Making changes to an RFC + +After your RFC has been merged, there is always opportunity to make changes. Open a pull request with the change you would like to make. If you are not the original author, be sure to @ the original authors to make sure they see and approve of the changes. + +## RFC postponement + +Some RFCs are marked `rejected` when the proposal is declined or when we want neither to think about evaluating the proposal nor about implementing it until some time in the future. Rejected RFCs may be revisited when the time is right. From 0832f11a6639ad6d6c04ef3d40e892e15339a072 Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Mon, 30 Mar 2026 11:58:45 -0700 Subject: [PATCH 10/45] fix(e2e): add uv-managed python binary glob to forward proxy L7 test (#686) The base sandbox image installs Python via uv at /sandbox/.uv/python/*/bin/python*, but the proxy resolves binary identity via /proc/PID/exe (the real path, not the symlink). 
The test policy only listed /usr/bin/python* and /usr/local/bin/python*, so OPA denied the connection at L4 before L7 evaluation could run. Co-authored-by: John Myers --- e2e/rust/tests/forward_proxy_l7_bypass.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/e2e/rust/tests/forward_proxy_l7_bypass.rs b/e2e/rust/tests/forward_proxy_l7_bypass.rs index 89ecf2cd4..c3ae584b0 100644 --- a/e2e/rust/tests/forward_proxy_l7_bypass.rs +++ b/e2e/rust/tests/forward_proxy_l7_bypass.rs @@ -156,6 +156,7 @@ network_policies: - path: /usr/bin/curl - path: /usr/bin/python* - path: /usr/local/bin/python* + - path: /sandbox/.uv/python/*/bin/python* "# ); file.write_all(policy.as_bytes()) From 38655a65eca9489b717cee8c371845467f3be350 Mon Sep 17 00:00:00 2001 From: LateNightHackathon <256481314+latenighthackathon@users.noreply.github.com> Date: Mon, 30 Mar 2026 14:12:29 -0500 Subject: [PATCH 11/45] fix(l7): reject requests with both CL and TE headers in inference parser (CWE-444) (#671) The CL/TE desynchronisation guard added in #663 for the REST path was not applied to the inference request parser. A request containing both Content-Length and Transfer-Encoding headers could be interpreted differently by the proxy and the upstream server, enabling HTTP request smuggling (CWE-444, RFC 7230 Section 3.3.3). Add the same rejection check and tests mirroring the REST parser coverage, including TE substring validation. 
Signed-off-by: latenighthackathon Co-authored-by: latenighthackathon --- crates/openshell-sandbox/src/l7/inference.rs | 57 ++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/crates/openshell-sandbox/src/l7/inference.rs b/crates/openshell-sandbox/src/l7/inference.rs index 140213f07..5136c8783 100644 --- a/crates/openshell-sandbox/src/l7/inference.rs +++ b/crates/openshell-sandbox/src/l7/inference.rs @@ -164,6 +164,12 @@ pub fn try_parse_http_request(buf: &[u8]) -> ParseResult { } } + if is_chunked && has_content_length { + return ParseResult::Invalid( + "Request contains both Transfer-Encoding and Content-Length headers".to_string(), + ); + } + let (body, consumed) = if is_chunked { let Some((decoded_body, consumed)) = parse_chunked_body(buf, body_start) else { return ParseResult::Incomplete; @@ -570,6 +576,24 @@ mod tests { assert_eq!(parsed.body.len(), 100); } + /// SEC: Transfer-Encoding substring match must not match partial tokens. + #[test] + fn te_substring_not_chunked() { + let body = r#"{"model":"m","messages":[]}"#; + let request = format!( + "POST /v1/chat/completions HTTP/1.1\r\n\ + Host: x\r\n\ + Transfer-Encoding: chunkedx\r\n\ + Content-Length: {}\r\n\ + \r\n{body}", + body.len(), + ); + let ParseResult::Complete(parsed, _) = try_parse_http_request(request.as_bytes()) else { + panic!("expected Complete for non-matching TE with valid CL"); + }; + assert_eq!(parsed.body.len(), body.len()); + } + // ---- SEC: Content-Length validation ---- #[test] @@ -608,4 +632,37 @@ mod tests { ParseResult::Invalid(_) )); } + + // ---- SEC-009: CL/TE desynchronisation ---- + + /// Reject requests with both Content-Length and Transfer-Encoding to + /// prevent CL/TE request smuggling (RFC 7230 Section 3.3.3). 
+ #[test] + fn reject_dual_content_length_and_transfer_encoding() { + let request = b"POST /v1/chat/completions HTTP/1.1\r\nHost: x\r\nContent-Length: 5\r\nTransfer-Encoding: chunked\r\n\r\n"; + assert!( + matches!( + try_parse_http_request(request), + ParseResult::Invalid(reason) + if reason.contains("Transfer-Encoding") + && reason.contains("Content-Length") + ), + "Must reject request with both CL and TE" + ); + } + + /// Same rejection regardless of header order. + #[test] + fn reject_dual_transfer_encoding_and_content_length() { + let request = b"POST /v1/chat/completions HTTP/1.1\r\nHost: x\r\nTransfer-Encoding: chunked\r\nContent-Length: 5\r\n\r\n"; + assert!( + matches!( + try_parse_http_request(request), + ParseResult::Invalid(reason) + if reason.contains("Transfer-Encoding") + && reason.contains("Content-Length") + ), + "Must reject request with both TE and CL" + ); + } } From 758c62d18dccf7a3a92bae31ac62fe94d5bf7434 Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Mon, 30 Mar 2026 13:19:19 -0700 Subject: [PATCH 12/45] fix(sandbox): handle per-path Landlock errors instead of abandoning entire ruleset (#677) * fix(sandbox): handle per-path Landlock errors instead of abandoning entire ruleset A single missing path (e.g., /app in containers without that directory) caused PathFd::new() to propagate an error out of the entire Landlock setup closure. Under BestEffort mode, this silently disabled all filesystem restrictions for the sandbox. 
Changes: - Extract try_open_path() and classify_path_error() helpers that handle PathFd failures per-path instead of per-ruleset - BestEffort mode: skip inaccessible paths with a warning, apply remaining rules - HardRequirement mode: fail immediately on any inaccessible path - Add zero-rule safety check to prevent applying an empty ruleset that would block all filesystem access - Pre-filter system-injected baseline paths (e.g., /app) in enrichment functions so missing paths never reach Landlock - Add unit tests for try_open_path, classify_path_error, and error classification for ENOENT, EACCES, ELOOP, ENAMETOOLONG, ENOTDIR - Update user-facing docs and architecture docs with Landlock behavior tables, baseline path filtering, and compatibility mode semantics - Fix stale ABI::V1 references in docs (code uses ABI::V2) Closes #664 * fix(sandbox): use debug log for NotFound in Landlock best-effort mode NotFound errors for stale baseline paths (e.g. /app persisted in the server-stored policy but absent in this container) are expected in best-effort mode. Downgrade from warn! to debug! so the message does not leak into SSH exec stdout (the pre_exec hook inherits the tracing subscriber whose writer targets fd 1). Genuine errors (permission denied, symlink loops, etc.) remain at warn! for operator visibility. Also move custom_image e2e marker from /opt to /etc (a Landlock baseline read-only path) since the security fix now properly enforces filesystem restrictions. 
--------- Co-authored-by: John Myers --- architecture/sandbox.md | 12 +- architecture/security-policy.md | 14 +- crates/openshell-sandbox/src/lib.rs | 35 +++ .../src/sandbox/linux/landlock.rs | 231 ++++++++++++++++-- docs/reference/policy-schema.md | 15 +- docs/sandboxes/policies.md | 15 +- e2e/rust/tests/custom_image.rs | 6 +- 7 files changed, 293 insertions(+), 35 deletions(-) diff --git a/architecture/sandbox.md b/architecture/sandbox.md index 1117d0f71..bfc71ba31 100644 --- a/architecture/sandbox.md +++ b/architecture/sandbox.md @@ -431,15 +431,21 @@ Landlock restricts the child process's filesystem access to an explicit allowlis 1. Build path lists from `filesystem.read_only` and `filesystem.read_write` 2. If `include_workdir` is true, add the working directory to `read_write` 3. If both lists are empty, skip Landlock entirely (no-op) -4. Create a Landlock ruleset targeting ABI V1: +4. Create a Landlock ruleset targeting ABI V2: - Read-only paths receive `AccessFs::from_read(abi)` rights - Read-write paths receive `AccessFs::from_all(abi)` rights -5. Call `ruleset.restrict_self()` -- this applies to the calling process and all descendants +5. For each path, attempt `PathFd::new()`. If it fails: + - `BestEffort`: Log a warning with the error classification (not found, permission denied, symlink loop, etc.) and skip the path. Continue building the ruleset from remaining valid paths. + - `HardRequirement`: Return a fatal error, aborting the sandbox. +6. If all paths failed (zero rules applied), return an error rather than calling `restrict_self()` on an empty ruleset (which would block all filesystem access) +7. 
Call `ruleset.restrict_self()` -- this applies to the calling process and all descendants -Error behavior depends on `LandlockCompatibility`: +Kernel-level error behavior (e.g., Landlock ABI unavailable) depends on `LandlockCompatibility`: - `BestEffort`: Log a warning and continue without filesystem isolation - `HardRequirement`: Return a fatal error, aborting the sandbox +**Baseline path filtering**: System-injected baseline paths (e.g., `/app`) are pre-filtered by `enrich_proto_baseline_paths()` / `enrich_sandbox_baseline_paths()` using `Path::exists()` before they reach Landlock. User-specified paths are not pre-filtered -- they are evaluated at Landlock apply time so misconfigurations surface as warnings or errors. + ### Seccomp syscall filtering **File:** `crates/openshell-sandbox/src/sandbox/linux/seccomp.rs` diff --git a/architecture/security-policy.md b/architecture/security-policy.md index 8c41e5b91..8b7b61d21 100644 --- a/architecture/security-policy.md +++ b/architecture/security-policy.md @@ -320,7 +320,7 @@ Controls which filesystem paths the sandboxed process can access. Enforced via L | `read_only` | `string[]` | `[]` | Paths accessible in read-only mode | | `read_write` | `string[]` | `[]` | Paths accessible in read-write mode | -**Enforcement mapping**: Each path becomes a Landlock `PathBeneath` rule. Read-only paths receive `AccessFs::from_read(ABI::V1)` permissions. Read-write paths receive `AccessFs::from_all(ABI::V1)` permissions (read, write, execute, create, delete, rename). All other paths are denied by the Landlock ruleset. +**Enforcement mapping**: Each path becomes a Landlock `PathBeneath` rule. Read-only paths receive `AccessFs::from_read(ABI::V2)` permissions. Read-write paths receive `AccessFs::from_all(ABI::V2)` permissions (read, write, execute, create, delete, rename). All other paths are denied by the Landlock ruleset. 
**Filesystem preparation**: Before the child process spawns, the supervisor creates any `read_write` directories that do not exist and sets their ownership to `process.run_as_user`:`process.run_as_group` via `chown()`. See `crates/openshell-sandbox/src/lib.rs` -- `prepare_filesystem()`. @@ -358,10 +358,16 @@ Controls Landlock LSM compatibility behavior. **Static field** -- immutable afte | Value | Behavior | | ------------------ | --------------------------------------------------------------------------------------------------------------------------- | -| `best_effort` | If Landlock is unavailable (older kernel, unprivileged container), log a warning and continue without filesystem sandboxing | -| `hard_requirement` | If Landlock is unavailable, abort sandbox startup with an error | +| `best_effort` | If Landlock is unavailable (older kernel, unprivileged container), log a warning and continue without filesystem sandboxing. Individual inaccessible paths (missing, permission denied, symlink loops) are skipped with a warning while remaining rules are still applied. If all paths fail, the sandbox continues without Landlock rather than applying an empty ruleset that would block all access. | +| `hard_requirement` | If Landlock is unavailable or any configured path cannot be opened, abort sandbox startup with an error. | -See `crates/openshell-sandbox/src/sandbox/linux/landlock.rs` -- `compat_level()`. +**Per-path error handling**: `PathFd::new()` (which wraps `open(path, O_PATH | O_CLOEXEC)`) can fail for several reasons beyond path non-existence: `EACCES` (permission denied), `ELOOP` (symlink loop), `ENAMETOOLONG`, `ENOTDIR`. Each failure is classified with a human-readable reason in logs. In `best_effort` mode, the path is skipped and ruleset construction continues. In `hard_requirement` mode, the error is fatal. 
+ +**Baseline path filtering**: The enrichment functions (`enrich_proto_baseline_paths`, `enrich_sandbox_baseline_paths`) pre-filter system-injected baseline paths (e.g., `/app`) by checking `Path::exists()` before adding them to the policy. This prevents missing baseline paths from reaching Landlock at all. User-specified paths are not pre-filtered — they are evaluated at Landlock apply time so that misconfigurations surface as warnings (`best_effort`) or errors (`hard_requirement`). + +**Zero-rule safety check**: If all paths in the ruleset fail to open, `apply()` returns an error rather than calling `restrict_self()` on an empty ruleset. An empty Landlock ruleset with `restrict_self()` would block all filesystem access — the inverse of the intended degradation behavior. This error is caught by the outer `BestEffort` handler, which logs a warning and continues without Landlock. + +See `crates/openshell-sandbox/src/sandbox/linux/landlock.rs` -- `compat_level()`, `try_open_path()`, `classify_path_fd_error()`, `classify_io_error()`. ```yaml landlock: diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 493e4d237..297d7fc38 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -899,12 +899,31 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) let mut modified = false; for &path in PROXY_BASELINE_READ_ONLY { if !fs.read_only.iter().any(|p| p.as_str() == path) { + // Baseline paths are system-injected, not user-specified. Skip + // paths that do not exist in this container image to avoid noisy + // warnings from Landlock and, more critically, to prevent a single + // missing baseline path from abandoning the entire Landlock + // ruleset under best-effort mode (see issue #664). 
+ if !std::path::Path::new(path).exists() { + debug!( + path, + "Baseline read-only path does not exist, skipping enrichment" + ); + continue; + } fs.read_only.push(path.to_string()); modified = true; } } for &path in PROXY_BASELINE_READ_WRITE { if !fs.read_write.iter().any(|p| p.as_str() == path) { + if !std::path::Path::new(path).exists() { + debug!( + path, + "Baseline read-write path does not exist, skipping enrichment" + ); + continue; + } fs.read_write.push(path.to_string()); modified = true; } @@ -929,6 +948,15 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { for &path in PROXY_BASELINE_READ_ONLY { let p = std::path::PathBuf::from(path); if !policy.filesystem.read_only.contains(&p) { + // Baseline paths are system-injected — skip non-existent paths to + // avoid Landlock ruleset abandonment (issue #664). + if !p.exists() { + debug!( + path, + "Baseline read-only path does not exist, skipping enrichment" + ); + continue; + } policy.filesystem.read_only.push(p); modified = true; } @@ -936,6 +964,13 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { for &path in PROXY_BASELINE_READ_WRITE { let p = std::path::PathBuf::from(path); if !policy.filesystem.read_write.contains(&p) { + if !p.exists() { + debug!( + path, + "Baseline read-write path does not exist, skipping enrichment" + ); + continue; + } policy.filesystem.read_write.push(p); modified = true; } diff --git a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs index e276840dd..abb91fd4f 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs @@ -5,11 +5,11 @@ use crate::policy::{LandlockCompatibility, SandboxPolicy}; use landlock::{ - ABI, Access, AccessFs, CompatLevel, Compatible, PathBeneath, PathFd, Ruleset, RulesetAttr, - RulesetCreatedAttr, + ABI, Access, AccessFs, CompatLevel, Compatible, PathBeneath, PathFd, PathFdError, Ruleset, + 
RulesetAttr, RulesetCreatedAttr, }; use miette::{IntoDiagnostic, Result}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use tracing::{debug, info, warn}; pub fn apply(policy: &SandboxPolicy, workdir: Option<&str>) -> Result<()> { @@ -29,6 +29,7 @@ pub fn apply(policy: &SandboxPolicy, workdir: Option<&str>) -> Result<()> { return Ok(()); } + let total_paths = read_only.len() + read_write.len(); let abi = ABI::V2; info!( abi = ?abi, @@ -38,47 +39,61 @@ pub fn apply(policy: &SandboxPolicy, workdir: Option<&str>) -> Result<()> { "Applying Landlock filesystem sandbox" ); + let compatibility = &policy.landlock.compatibility; + let result: Result<()> = (|| { let access_all = AccessFs::from_all(abi); let access_read = AccessFs::from_read(abi); let mut ruleset = Ruleset::default(); ruleset = ruleset - .set_compatibility(compat_level(&policy.landlock.compatibility)) + .set_compatibility(compat_level(compatibility)) .handle_access(access_all) .into_diagnostic()?; let mut ruleset = ruleset.create().into_diagnostic()?; + let mut rules_applied: usize = 0; + + for path in &read_only { + if let Some(path_fd) = try_open_path(path, compatibility)? { + debug!(path = %path.display(), "Landlock allow read-only"); + ruleset = ruleset + .add_rule(PathBeneath::new(path_fd, access_read)) + .into_diagnostic()?; + rules_applied += 1; + } + } - for path in read_only { - debug!(path = %path.display(), "Landlock allow read-only"); - ruleset = ruleset - .add_rule(PathBeneath::new( - PathFd::new(path).into_diagnostic()?, - access_read, - )) - .into_diagnostic()?; + for path in &read_write { + if let Some(path_fd) = try_open_path(path, compatibility)? 
{ + debug!(path = %path.display(), "Landlock allow read-write"); + ruleset = ruleset + .add_rule(PathBeneath::new(path_fd, access_all)) + .into_diagnostic()?; + rules_applied += 1; + } } - for path in read_write { - debug!(path = %path.display(), "Landlock allow read-write"); - ruleset = ruleset - .add_rule(PathBeneath::new( - PathFd::new(path).into_diagnostic()?, - access_all, - )) - .into_diagnostic()?; + if rules_applied == 0 { + return Err(miette::miette!( + "Landlock ruleset has zero valid paths — all {} path(s) failed to open. \ + Refusing to apply an empty ruleset that would block all filesystem access.", + total_paths, + )); } + let skipped = total_paths - rules_applied; + info!( + rules_applied, + skipped, "Landlock ruleset built successfully" + ); + ruleset.restrict_self().into_diagnostic()?; Ok(()) })(); if let Err(err) = result { - if matches!( - policy.landlock.compatibility, - LandlockCompatibility::BestEffort - ) { + if matches!(compatibility, LandlockCompatibility::BestEffort) { warn!( error = %err, "Landlock filesystem sandbox is UNAVAILABLE — running WITHOUT filesystem restrictions. \ @@ -92,9 +107,177 @@ pub fn apply(policy: &SandboxPolicy, workdir: Option<&str>) -> Result<()> { Ok(()) } +/// Attempt to open a path for Landlock rule creation. +/// +/// In `BestEffort` mode, inaccessible paths (missing, permission denied, symlink +/// loops, etc.) are skipped with a warning and `Ok(None)` is returned so the +/// caller can continue building the ruleset from the remaining valid paths. +/// +/// In `HardRequirement` mode, any failure is fatal — the caller propagates the +/// error, which ultimately aborts sandbox startup. +fn try_open_path(path: &Path, compatibility: &LandlockCompatibility) -> Result> { + match PathFd::new(path) { + Ok(fd) => Ok(Some(fd)), + Err(err) => { + let reason = classify_path_fd_error(&err); + let is_not_found = matches!( + &err, + PathFdError::OpenCall { source, .. 
} + if source.kind() == std::io::ErrorKind::NotFound + ); + match compatibility { + LandlockCompatibility::BestEffort => { + // NotFound is expected for stale baseline paths (e.g. + // /app baked into the server-stored policy but absent + // in this container image). Log at debug! to avoid + // polluting SSH exec stdout — the pre_exec hook + // inherits the tracing subscriber whose writer targets + // fd 1 (the pipe/PTY). + // + // Other errors (permission denied, symlink loops, etc.) + // are genuinely unexpected and logged at warn!. + if is_not_found { + debug!( + path = %path.display(), + reason, + "Skipping non-existent Landlock path (best-effort mode)" + ); + } else { + warn!( + path = %path.display(), + error = %err, + reason, + "Skipping inaccessible Landlock path (best-effort mode)" + ); + } + Ok(None) + } + LandlockCompatibility::HardRequirement => Err(miette::miette!( + "Landlock path unavailable in hard_requirement mode: {} ({}): {}", + path.display(), + reason, + err, + )), + } + } + } +} + +/// Classify a [`PathFdError`] into a human-readable reason. +/// +/// `PathFd::new()` wraps `open(path, O_PATH | O_CLOEXEC)` which can fail for +/// several reasons beyond simple non-existence. The `PathFdError::OpenCall` +/// variant wraps the underlying `std::io::Error`. +fn classify_path_fd_error(err: &PathFdError) -> &'static str { + match err { + PathFdError::OpenCall { source, .. } => classify_io_error(source), + // PathFdError is #[non_exhaustive], handle future variants gracefully. + _ => "unexpected error", + } +} + +/// Classify a `std::io::Error` into a human-readable reason string. 
+fn classify_io_error(err: &std::io::Error) -> &'static str { + match err.kind() { + std::io::ErrorKind::NotFound => "path does not exist", + std::io::ErrorKind::PermissionDenied => "permission denied", + _ => match err.raw_os_error() { + Some(40) => "too many symlink levels", // ELOOP + Some(36) => "path name too long", // ENAMETOOLONG + Some(20) => "path component is not a directory", // ENOTDIR + _ => "unexpected error", + }, + } +} + fn compat_level(level: &LandlockCompatibility) -> CompatLevel { match level { LandlockCompatibility::BestEffort => CompatLevel::BestEffort, LandlockCompatibility::HardRequirement => CompatLevel::HardRequirement, } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn try_open_path_best_effort_returns_none_for_missing_path() { + let result = try_open_path( + &PathBuf::from("/nonexistent/openshell/test/path"), + &LandlockCompatibility::BestEffort, + ); + assert!(result.is_ok()); + assert!(result.unwrap().is_none()); + } + + #[test] + fn try_open_path_hard_requirement_errors_for_missing_path() { + let result = try_open_path( + &PathBuf::from("/nonexistent/openshell/test/path"), + &LandlockCompatibility::HardRequirement, + ); + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("hard_requirement"), + "error should mention hard_requirement mode: {err_msg}" + ); + assert!( + err_msg.contains("does not exist"), + "error should include the classified reason: {err_msg}" + ); + } + + #[test] + fn try_open_path_succeeds_for_existing_path() { + let dir = tempfile::tempdir().unwrap(); + let result = try_open_path(dir.path(), &LandlockCompatibility::BestEffort); + assert!(result.is_ok()); + assert!(result.unwrap().is_some()); + } + + #[test] + fn classify_not_found() { + let err = std::io::Error::from_raw_os_error(libc::ENOENT); + assert_eq!(classify_io_error(&err), "path does not exist"); + } + + #[test] + fn classify_permission_denied() { + let err = 
std::io::Error::from_raw_os_error(libc::EACCES); + assert_eq!(classify_io_error(&err), "permission denied"); + } + + #[test] + fn classify_symlink_loop() { + let err = std::io::Error::from_raw_os_error(libc::ELOOP); + assert_eq!(classify_io_error(&err), "too many symlink levels"); + } + + #[test] + fn classify_name_too_long() { + let err = std::io::Error::from_raw_os_error(libc::ENAMETOOLONG); + assert_eq!(classify_io_error(&err), "path name too long"); + } + + #[test] + fn classify_not_a_directory() { + let err = std::io::Error::from_raw_os_error(libc::ENOTDIR); + assert_eq!(classify_io_error(&err), "path component is not a directory"); + } + + #[test] + fn classify_unknown_error() { + let err = std::io::Error::from_raw_os_error(libc::EIO); + assert_eq!(classify_io_error(&err), "unexpected error"); + } + + #[test] + fn classify_path_fd_error_extracts_io_error() { + // Use PathFd::new on a non-existent path to get a real PathFdError + // (the OpenCall variant is #[non_exhaustive] and can't be constructed directly). + let err = PathFd::new("/nonexistent/openshell/classify/test").unwrap_err(); + assert_eq!(classify_path_fd_error(&err), "path does not exist"); + } +} diff --git a/docs/reference/policy-schema.md b/docs/reference/policy-schema.md index cb37d0bae..6916e8d0c 100644 --- a/docs/reference/policy-schema.md +++ b/docs/reference/policy-schema.md @@ -105,7 +105,20 @@ Configures [Landlock LSM](https://docs.kernel.org/security/landlock.html) enforc | Field | Type | Required | Values | Description | |---|---|---|---|---| -| `compatibility` | string | No | `best_effort`, `hard_requirement` | How OpenShell handles kernel ABI differences. `best_effort` uses the highest Landlock ABI the host kernel supports. `hard_requirement` fails if the required ABI is unavailable. | +| `compatibility` | string | No | `best_effort`, `hard_requirement` | How OpenShell handles Landlock failures. See behavior table below. 
| + +**Compatibility modes:** + +| Value | Kernel ABI unavailable | Individual path inaccessible | All paths inaccessible | +|---|---|---|---| +| `best_effort` | Warns and continues without Landlock. | Skips the path, applies remaining rules. | Warns and continues without Landlock (refuses to apply an empty ruleset). | +| `hard_requirement` | Aborts sandbox startup. | Aborts sandbox startup. | Aborts sandbox startup. | + +`best_effort` (the default) is appropriate for most deployments. It handles missing paths gracefully -- for example, `/app` may not exist in every container image but is included in the baseline path set for containers that do have it. Individual missing paths are skipped while the remaining filesystem rules are still enforced. + +`hard_requirement` is for environments where any gap in filesystem isolation is unacceptable. If a listed path cannot be opened for any reason (missing, permission denied, symlink loop), sandbox startup fails immediately rather than running with reduced protection. + +When a path is skipped under `best_effort`, the sandbox logs a warning that includes the path, the specific error, and a human-readable reason (for example, "path does not exist" or "permission denied"). Example: diff --git a/docs/sandboxes/policies.md b/docs/sandboxes/policies.md index 565a7a4c9..fa5ed5d8b 100644 --- a/docs/sandboxes/policies.md +++ b/docs/sandboxes/policies.md @@ -70,10 +70,23 @@ Dynamic sections can be updated on a running sandbox with `openshell policy set` | Section | Type | Description | |---|---|---| | `filesystem_policy` | Static | Controls which directories the agent can access on disk. Paths are split into `read_only` and `read_write` lists. Any path not listed in either list is inaccessible. Set `include_workdir: true` to automatically add the agent's working directory to `read_write`. [Landlock LSM](https://docs.kernel.org/security/landlock.html) enforces these restrictions at the kernel level. 
| -| `landlock` | Static | Configures Landlock LSM enforcement behavior. Set `compatibility` to `best_effort` (use the highest ABI the host kernel supports) or `hard_requirement` (fail if the required ABI is unavailable). | +| `landlock` | Static | Configures Landlock LSM enforcement behavior. Set `compatibility` to `best_effort` (skip individual inaccessible paths while applying remaining rules) or `hard_requirement` (fail if any path is inaccessible or the required kernel ABI is unavailable). See the [Policy Schema Reference](../reference/policy-schema.md#landlock) for the full behavior table. | | `process` | Static | Sets the OS-level identity for the agent process. `run_as_user` and `run_as_group` default to `sandbox`. Root (`root` or `0`) is rejected. The agent also runs with seccomp filters that block dangerous system calls. | | `network_policies` | Dynamic | Controls network access for ordinary outbound traffic from the sandbox. Each block has a name, a list of endpoints (host, port, protocol, and optional rules), and a list of binaries allowed to use those endpoints.
Every outbound connection except `https://inference.local` goes through the proxy, which queries the {doc}`policy engine <../about/architecture>` with the destination and calling binary. A connection is allowed only when both match an entry in the same policy block.
For endpoints with `protocol: rest`, the proxy auto-detects TLS and terminates it so each HTTP request is checked against that endpoint's `rules` (method and path).
Endpoints without `protocol` allow the TCP stream through without inspecting payloads.
If no endpoint matches, the connection is denied. Configure managed inference separately through {doc}`../inference/configure`. | +## Baseline Filesystem Paths + +When a sandbox runs in proxy mode (the default), OpenShell automatically adds baseline filesystem paths required for the sandbox child process to function: `/usr`, `/lib`, `/etc`, `/var/log` (read-only) and `/sandbox`, `/tmp` (read-write). Paths like `/app` are included in the baseline set but are only added if they exist in the container image. + +This filtering prevents a missing baseline path from degrading Landlock enforcement. Without it, a single missing path could cause the entire Landlock ruleset to fail, leaving the sandbox with no filesystem restrictions at all. + +User-specified paths in your policy YAML are not pre-filtered. If you list a path that does not exist: + +- In `best_effort` mode, the path is skipped with a warning and remaining rules are still applied. +- In `hard_requirement` mode, sandbox startup fails immediately. + +This distinction means baseline system paths degrade gracefully while user-specified paths surface configuration errors. + ## Apply a Custom Policy Pass a policy YAML file when creating the sandbox: diff --git a/e2e/rust/tests/custom_image.rs b/e2e/rust/tests/custom_image.rs index 14fc3f47a..10cf99091 100644 --- a/e2e/rust/tests/custom_image.rs +++ b/e2e/rust/tests/custom_image.rs @@ -26,7 +26,9 @@ RUN groupadd -g 1000 sandbox && \ useradd -m -u 1000 -g sandbox sandbox # Write a marker file so we can verify this is our custom image. -RUN echo "custom-image-e2e-marker" > /opt/marker.txt +# Place under /etc (Landlock baseline read-only path) so the sandbox +# can read it when filesystem restrictions are properly enforced. 
+RUN echo "custom-image-e2e-marker" > /etc/marker.txt CMD ["sleep", "infinity"] "#; @@ -53,7 +55,7 @@ async fn sandbox_from_custom_dockerfile() { dockerfile_str, "--", "cat", - "/opt/marker.txt", + "/etc/marker.txt", ]) .await .expect("sandbox create from Dockerfile"); From 8c4b17221c76127e5c893559b80ae52759faab0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vinicius=20Corr=C3=AAa?= Date: Mon, 30 Mar 2026 17:50:25 -0300 Subject: [PATCH 13/45] Missed input parameter (#645) Missed "--from openclaw" input parameter. Without this input sandbox container didn't install "openclaw": 'Stop with: openshell forward stop 18789 eager-dragonfly '/bin/bash: line 1: openclaw-start: command not found --- examples/openclaw.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/openclaw.md b/examples/openclaw.md index c1ccd2cd4..f098e139e 100644 --- a/examples/openclaw.md +++ b/examples/openclaw.md @@ -3,7 +3,7 @@ ## Quick start ```sh -openshell sandbox create --forward 18789 -- openclaw-start +openshell sandbox create --forward 18789 --from openclaw -- openclaw-start ``` `openclaw-start` is a helper script pre-installed in the sandbox that runs the @@ -25,7 +25,7 @@ Note: you will need use the auth token present in the bootstrapping process to c ### Create the sandbox ```sh -openshell sandbox create --forward 18789 +openshell sandbox create --forward 18789 --from openclaw ``` Inside the sandbox, run the onboarding wizard and start the gateway: From e8950e624c7e98e60e624d25e7d57d0003a8b61c Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Mon, 30 Mar 2026 13:53:12 -0700 Subject: [PATCH 14/45] feat(sandbox): add L7 query parameter matchers (#617) * feat(sandbox): add L7 query parameter matchers Signed-off-by: John Myers <9696606+johntmyers@users.noreply.github.com> * fix(sandbox): decode + as space in query params and validate glob syntax Three improvements from PR #617 review: 1. 
Decode + as space in query string values per the application/x-www-form-urlencoded convention. This matches Python's urllib.parse, JavaScript's URLSearchParams, Go's url.ParseQuery, and most HTTP frameworks. Literal + should be sent as %2B. 2. Add glob pattern syntax validation (warnings) for query matchers. Checks for unclosed brackets and braces in glob/any patterns. These are warnings (not errors) because OPA's glob.match is forgiving, but they surface likely typos during policy loading. 3. Add missing test cases: empty query values, keys without values, unicode after percent-decoding, empty query strings, and literal + via %2B encoding. * fix(sandbox): add missing query_params field in forward proxy L7 request info * style(sandbox): fix formatting in proxy L7 query param parsing --------- Signed-off-by: John Myers <9696606+johntmyers@users.noreply.github.com> Co-authored-by: John Myers --- architecture/sandbox.md | 6 +- architecture/security-policy.md | 8 +- crates/openshell-policy/src/lib.rs | 94 ++++- .../data/sandbox-policy.rego | 50 +++ crates/openshell-sandbox/src/l7/mod.rs | 351 +++++++++++++++++- crates/openshell-sandbox/src/l7/provider.rs | 3 + crates/openshell-sandbox/src/l7/relay.rs | 3 + crates/openshell-sandbox/src/l7/rest.rs | 202 +++++++++- .../src/mechanistic_mapper.rs | 1 + crates/openshell-sandbox/src/opa.rs | 209 ++++++++++- crates/openshell-sandbox/src/proxy.rs | 5 +- docs/reference/policy-schema.md | 6 + docs/sandboxes/policies.md | 26 ++ e2e/python/test_sandbox_policy.py | 246 ++++++++++++ proto/sandbox.proto | 12 + 15 files changed, 1202 insertions(+), 20 deletions(-) diff --git a/architecture/sandbox.md b/architecture/sandbox.md index bfc71ba31..333cef5ea 100644 --- a/architecture/sandbox.md +++ b/architecture/sandbox.md @@ -968,7 +968,7 @@ flowchart LR | `EnforcementMode` | `Audit`, `Enforce` | What to do on L7 deny (log-only vs block) | | `L7EndpointConfig` | `{ protocol, tls, enforcement }` | Per-endpoint L7 configuration | | 
`L7Decision` | `{ allowed, reason, matched_rule }` | Result of L7 evaluation | -| `L7RequestInfo` | `{ action, target }` | HTTP method + path for policy evaluation | +| `L7RequestInfo` | `{ action, target, query_params }` | HTTP method, path, and decoded query multimap for policy evaluation | ### Access presets @@ -1047,7 +1047,7 @@ This enables credential injection on all HTTPS endpoints automatically, without Implements `L7Provider` for HTTP/1.1: -- **`parse_request()`**: Reads up to 16 KiB of headers, parses the request line (method, path), determines body framing from `Content-Length` or `Transfer-Encoding: chunked` headers. Returns `L7Request` with raw header bytes (may include overflow body bytes). +- **`parse_request()`**: Reads up to 16 KiB of headers, parses the request line (method, path), decodes query parameters into a multimap, determines body framing from `Content-Length` or `Transfer-Encoding: chunked` headers. Returns `L7Request` with raw header bytes (may include overflow body bytes). - **`relay()`**: Forwards request headers and body to upstream (handling Content-Length, chunked, and no-body cases), then reads and relays the full response back to the client. @@ -1060,7 +1060,7 @@ Implements `L7Provider` for HTTP/1.1: `relay_with_inspection()` in `crates/openshell-sandbox/src/l7/relay.rs` is the main relay loop: 1. Parse one HTTP request from client via the provider -2. Build L7 input JSON with `request.method`, `request.path`, plus the CONNECT-level context (host, port, binary, ancestors, cmdline) +2. Build L7 input JSON with `request.method`, `request.path`, `request.query_params`, plus the CONNECT-level context (host, port, binary, ancestors, cmdline) 3. Evaluate `data.openshell.sandbox.allow_request` and `data.openshell.sandbox.request_deny_reason` 4. Log the L7 decision (tagged `L7_REQUEST`) 5. 
If allowed (or audit mode): relay request to upstream and response back to client, then loop diff --git a/architecture/security-policy.md b/architecture/security-policy.md index 8b7b61d21..555ba67a5 100644 --- a/architecture/security-policy.md +++ b/architecture/security-policy.md @@ -467,9 +467,14 @@ rules: - allow: method: GET path: "/repos/**" + query: + per_page: "1*" - allow: method: POST path: "/repos/*/issues" + query: + labels: + any: ["bug*", "p1*"] ``` #### `L7Allow` @@ -479,8 +484,9 @@ rules: | `method` | `string` | HTTP method: `GET`, `HEAD`, `POST`, `PUT`, `DELETE`, `PATCH`, `OPTIONS`, or `*` (any). Case-insensitive matching. | | `path` | `string` | URL path glob pattern: `**` matches everything, otherwise `glob.match` with `/` delimiter. | | `command` | `string` | SQL command: `SELECT`, `INSERT`, `UPDATE`, `DELETE`, or `*` (any). Case-insensitive matching. For `protocol: sql` endpoints. | +| `query` | `map` | Optional REST query rules keyed by decoded query param name. Value is either a glob string (for example, `tag: "foo-*"`) or `{ any: ["foo-*", "bar-*"] }`. | -Method and command fields use `*` as wildcard for "any". Path patterns use `**` for "match everything" and standard glob patterns with `/` as a delimiter otherwise. See `sandbox-policy.rego` -- `method_matches()`, `path_matches()`, `command_matches()`. +Method and command fields use `*` as wildcard for "any". Path patterns use `**` for "match everything" and standard glob patterns with `/` as a delimiter otherwise. Query matching is case-sensitive and evaluates decoded values; when duplicate keys are present in the request, every value for that key must match the configured matcher. See `sandbox-policy.rego` -- `method_matches()`, `path_matches()`, `command_matches()`, `query_params_match()`. 
#### Access Presets diff --git a/crates/openshell-policy/src/lib.rs b/crates/openshell-policy/src/lib.rs index f1c15539e..7adb4dfda 100644 --- a/crates/openshell-policy/src/lib.rs +++ b/crates/openshell-policy/src/lib.rs @@ -15,8 +15,8 @@ use std::path::Path; use miette::{IntoDiagnostic, Result, WrapErr}; use openshell_core::proto::{ - FilesystemPolicy, L7Allow, L7Rule, LandlockPolicy, NetworkBinary, NetworkEndpoint, - NetworkPolicyRule, ProcessPolicy, SandboxPolicy, + FilesystemPolicy, L7Allow, L7QueryMatcher, L7Rule, LandlockPolicy, NetworkBinary, + NetworkEndpoint, NetworkPolicyRule, ProcessPolicy, SandboxPolicy, }; use serde::{Deserialize, Serialize}; @@ -120,6 +120,22 @@ struct L7AllowDef { path: String, #[serde(default, skip_serializing_if = "String::is_empty")] command: String, + #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] + query: BTreeMap, +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(untagged)] +enum QueryMatcherDef { + Glob(String), + Any(QueryAnyDef), +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +struct QueryAnyDef { + #[serde(default, skip_serializing_if = "Vec::is_empty")] + any: Vec, } #[derive(Debug, Serialize, Deserialize)] @@ -176,6 +192,23 @@ fn to_proto(raw: PolicyFile) -> SandboxPolicy { method: r.allow.method, path: r.allow.path, command: r.allow.command, + query: r + .allow + .query + .into_iter() + .map(|(key, matcher)| { + let proto = match matcher { + QueryMatcherDef::Glob(glob) => { + L7QueryMatcher { glob, any: vec![] } + } + QueryMatcherDef::Any(any) => L7QueryMatcher { + glob: String::new(), + any: any.any, + }, + }; + (key, proto) + }) + .collect(), }), }) .collect(), @@ -275,6 +308,20 @@ fn from_proto(policy: &SandboxPolicy) -> PolicyFile { method: a.method, path: a.path, command: a.command, + query: a + .query + .into_iter() + .map(|(key, matcher)| { + let yaml_matcher = if !matcher.any.is_empty() { + QueryMatcherDef::Any(QueryAnyDef { + any: matcher.any, + }) + } else { + 
QueryMatcherDef::Glob(matcher.glob) + }; + (key, yaml_matcher) + }) + .collect(), }, } }) @@ -754,6 +801,49 @@ network_policies: assert_eq!(rule.binaries[0].path, "/usr/bin/curl"); } + #[test] + fn parse_l7_query_matchers_and_round_trip() { + let yaml = r#" +version: 1 +network_policies: + query_test: + name: query_test + endpoints: + - host: api.example.com + port: 8080 + protocol: rest + rules: + - allow: + method: GET + path: /download + query: + slug: "my-*" + tag: + any: ["foo-*", "bar-*"] + binaries: + - path: /usr/bin/curl +"#; + let proto = parse_sandbox_policy(yaml).expect("parse failed"); + let allow = proto.network_policies["query_test"].endpoints[0].rules[0] + .allow + .as_ref() + .expect("allow"); + assert_eq!(allow.query["slug"].glob, "my-*"); + assert_eq!(allow.query["slug"].any, Vec::::new()); + assert_eq!(allow.query["tag"].any, vec!["foo-*", "bar-*"]); + assert!(allow.query["tag"].glob.is_empty()); + + let yaml_out = serialize_sandbox_policy(&proto).expect("serialize failed"); + let proto_round_trip = parse_sandbox_policy(&yaml_out).expect("re-parse failed"); + let allow_round_trip = proto_round_trip.network_policies["query_test"].endpoints[0].rules + [0] + .allow + .as_ref() + .expect("allow"); + assert_eq!(allow_round_trip.query["slug"].glob, "my-*"); + assert_eq!(allow_round_trip.query["tag"].any, vec!["foo-*", "bar-*"]); + } + #[test] fn parse_rejects_unknown_fields() { let yaml = "version: 1\nbogus_field: true\n"; diff --git a/crates/openshell-sandbox/data/sandbox-policy.rego b/crates/openshell-sandbox/data/sandbox-policy.rego index 1544dfe55..0a7a33888 100644 --- a/crates/openshell-sandbox/data/sandbox-policy.rego +++ b/crates/openshell-sandbox/data/sandbox-policy.rego @@ -208,6 +208,7 @@ request_allowed_for_endpoint(request, endpoint) if { rule.allow.method method_matches(request.method, rule.allow.method) path_matches(request.path, rule.allow.path) + query_params_match(request, rule) } # --- L7 rule matching: SQL command --- @@ -235,6 
+236,55 @@ path_matches(actual, pattern) if { glob.match(pattern, ["/"], actual) } +# Query matching: +# - If no query rules are configured, allow any query params. +# - For configured keys, all request values for that key must match. +# - Matcher shape supports either `glob` or `any`. +query_params_match(request, rule) if { + query_rules := object.get(rule.allow, "query", {}) + not query_mismatch(request, query_rules) +} + +query_mismatch(request, query_rules) if { + some key + matcher := query_rules[key] + not query_key_matches(request, key, matcher) +} + +query_key_matches(request, key, matcher) if { + request_query := object.get(request, "query_params", {}) + values := object.get(request_query, key, null) + values != null + count(values) > 0 + not query_value_mismatch(values, matcher) +} + +query_value_mismatch(values, matcher) if { + some i + value := values[i] + not query_value_matches(value, matcher) +} + +query_value_matches(value, matcher) if { + is_string(matcher) + glob.match(matcher, [], value) +} + +query_value_matches(value, matcher) if { + is_object(matcher) + glob_pattern := object.get(matcher, "glob", "") + glob_pattern != "" + glob.match(glob_pattern, [], value) +} + +query_value_matches(value, matcher) if { + is_object(matcher) + any_patterns := object.get(matcher, "any", []) + count(any_patterns) > 0 + some i + glob.match(any_patterns[i], [], value) +} + # SQL command matching: "*" matches any; otherwise case-insensitive. command_matches(_, "*") if true diff --git a/crates/openshell-sandbox/src/l7/mod.rs b/crates/openshell-sandbox/src/l7/mod.rs index 09e547885..880b6fd9e 100644 --- a/crates/openshell-sandbox/src/l7/mod.rs +++ b/crates/openshell-sandbox/src/l7/mod.rs @@ -76,6 +76,8 @@ pub struct L7RequestInfo { pub action: String, /// Target: URL path for REST, or empty for SQL. pub target: String, + /// Decoded query parameter multimap for REST requests. 
+ pub query_params: std::collections::HashMap>, } /// Parse an L7 endpoint config from a regorus Value (returned by Rego query). @@ -144,6 +146,49 @@ fn get_object_str(val: ®orus::Value, key: &str) -> Option { } } +/// Check a glob pattern for obvious syntax issues. +/// +/// Returns `Some(warning_message)` if the pattern looks malformed. +/// OPA's `glob.match` is forgiving, so these are warnings (not errors) +/// to surface likely typos without blocking policy loading. +fn check_glob_syntax(pattern: &str) -> Option { + let mut bracket_depth: i32 = 0; + for c in pattern.chars() { + match c { + '[' => bracket_depth += 1, + ']' => { + if bracket_depth == 0 { + return Some(format!("glob pattern '{pattern}' has unmatched ']'")); + } + bracket_depth -= 1; + } + _ => {} + } + } + if bracket_depth > 0 { + return Some(format!("glob pattern '{pattern}' has unclosed '['")); + } + + let mut brace_depth: i32 = 0; + for c in pattern.chars() { + match c { + '{' => brace_depth += 1, + '}' => { + if brace_depth == 0 { + return Some(format!("glob pattern '{pattern}' has unmatched '}}'")); + } + brace_depth -= 1; + } + _ => {} + } + } + if brace_depth > 0 { + return Some(format!("glob pattern '{pattern}' has unclosed '{{'")); + } + + None +} + /// Validate L7 policy configuration in the loaded OPA data. /// /// Returns a list of errors and warnings. Errors should prevent sandbox startup; @@ -279,7 +324,7 @@ pub fn validate_l7_policies(data_json: &serde_json::Value) -> (Vec, Vec< "GET", "HEAD", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "*", ]; if let Some(rules) = ep.get("rules").and_then(|v| v.as_array()) { - for rule in rules { + for (rule_idx, rule) in rules.iter().enumerate() { if let Some(method) = rule .get("allow") .and_then(|a| a.get("method")) @@ -291,6 +336,110 @@ pub fn validate_l7_policies(data_json: &serde_json::Value) -> (Vec, Vec< "{loc}: Unknown HTTP method '{method}'. Standard methods: GET, HEAD, POST, PUT, DELETE, PATCH, OPTIONS." 
)); } + + let Some(query) = rule + .get("allow") + .and_then(|a| a.get("query")) + .filter(|v| !v.is_null()) + else { + continue; + }; + + let Some(query_obj) = query.as_object() else { + errors.push(format!( + "{loc}.rules[{rule_idx}].allow.query: expected map of query matchers" + )); + continue; + }; + + for (param, matcher) in query_obj { + if let Some(glob_str) = matcher.as_str() { + if let Some(warning) = check_glob_syntax(glob_str) { + warnings.push(format!( + "{loc}.rules[{rule_idx}].allow.query.{param}: {warning}" + )); + } + continue; + } + + let Some(matcher_obj) = matcher.as_object() else { + errors.push(format!( + "{loc}.rules[{rule_idx}].allow.query.{param}: expected string glob or object with `any`" + )); + continue; + }; + + let has_any = matcher_obj.get("any").is_some(); + let has_glob = matcher_obj.get("glob").is_some(); + let has_unknown = matcher_obj.keys().any(|k| k != "any" && k != "glob"); + if has_unknown { + errors.push(format!( + "{loc}.rules[{rule_idx}].allow.query.{param}: unknown matcher keys; only `glob` or `any` are supported" + )); + continue; + } + + if has_glob && has_any { + errors.push(format!( + "{loc}.rules[{rule_idx}].allow.query.{param}: matcher cannot specify both `glob` and `any`" + )); + continue; + } + + if !has_glob && !has_any { + errors.push(format!( + "{loc}.rules[{rule_idx}].allow.query.{param}: object matcher requires `glob` string or non-empty `any` list" + )); + continue; + } + + if has_glob { + match matcher_obj.get("glob").and_then(|v| v.as_str()) { + None => { + errors.push(format!( + "{loc}.rules[{rule_idx}].allow.query.{param}.glob: expected glob string" + )); + } + Some(g) => { + if let Some(warning) = check_glob_syntax(g) { + warnings.push(format!( + "{loc}.rules[{rule_idx}].allow.query.{param}.glob: {warning}" + )); + } + } + } + continue; + } + + let any = matcher_obj.get("any").and_then(|v| v.as_array()); + let Some(any) = any else { + errors.push(format!( + 
"{loc}.rules[{rule_idx}].allow.query.{param}.any: expected array of glob strings" + )); + continue; + }; + + if any.is_empty() { + errors.push(format!( + "{loc}.rules[{rule_idx}].allow.query.{param}.any: list must not be empty" + )); + continue; + } + + if any.iter().any(|v| v.as_str().is_none()) { + errors.push(format!( + "{loc}.rules[{rule_idx}].allow.query.{param}.any: all values must be strings" + )); + } + + for item in any.iter().filter_map(|v| v.as_str()) { + if let Some(warning) = check_glob_syntax(item) { + warnings.push(format!( + "{loc}.rules[{rule_idx}].allow.query.{param}.any: {warning}" + )); + } + } + } } } } @@ -780,4 +929,204 @@ mod tests { "should have no tls warnings with auto-detect: {warnings:?}" ); } + + #[test] + fn validate_query_any_requires_non_empty_array() { + let data = serde_json::json!({ + "network_policies": { + "test": { + "endpoints": [{ + "host": "api.example.com", + "port": 8080, + "protocol": "rest", + "rules": [{ + "allow": { + "method": "GET", + "path": "/download", + "query": { + "tag": { "any": [] } + } + } + }] + }], + "binaries": [] + } + } + }); + let (errors, _warnings) = validate_l7_policies(&data); + assert!( + errors.iter().any(|e| e.contains("allow.query.tag.any")), + "expected query any validation error, got: {errors:?}" + ); + } + + #[test] + fn validate_query_object_rejects_unknown_keys() { + let data = serde_json::json!({ + "network_policies": { + "test": { + "endpoints": [{ + "host": "api.example.com", + "port": 8080, + "protocol": "rest", + "rules": [{ + "allow": { + "method": "GET", + "path": "/download", + "query": { + "tag": { "mode": "foo-*" } + } + } + }] + }], + "binaries": [] + } + } + }); + let (errors, _warnings) = validate_l7_policies(&data); + assert!( + errors.iter().any(|e| e.contains("unknown matcher keys")), + "expected unknown query matcher key error, got: {errors:?}" + ); + } + + #[test] + fn validate_query_glob_warns_on_unclosed_bracket() { + let data = serde_json::json!({ + 
"network_policies": { + "test": { + "endpoints": [{ + "host": "api.example.com", + "port": 8080, + "protocol": "rest", + "rules": [{ + "allow": { + "method": "GET", + "path": "/download", + "query": { + "tag": "[unclosed" + } + } + }] + }], + "binaries": [] + } + } + }); + let (errors, warnings) = validate_l7_policies(&data); + assert!( + errors.is_empty(), + "malformed glob should warn, not error: {errors:?}" + ); + assert!( + warnings + .iter() + .any(|w| w.contains("unclosed '['") && w.contains("allow.query.tag")), + "expected glob syntax warning, got: {warnings:?}" + ); + } + + #[test] + fn validate_query_glob_warns_on_unclosed_brace() { + let data = serde_json::json!({ + "network_policies": { + "test": { + "endpoints": [{ + "host": "api.example.com", + "port": 8080, + "protocol": "rest", + "rules": [{ + "allow": { + "method": "GET", + "path": "/download", + "query": { + "format": { "glob": "{json,xml" } + } + } + }] + }], + "binaries": [] + } + } + }); + let (errors, warnings) = validate_l7_policies(&data); + assert!( + errors.is_empty(), + "malformed glob should warn, not error: {errors:?}" + ); + assert!( + warnings + .iter() + .any(|w| w.contains("unclosed '{'") && w.contains("allow.query.format.glob")), + "expected glob syntax warning, got: {warnings:?}" + ); + } + + #[test] + fn validate_query_any_warns_on_malformed_glob_item() { + let data = serde_json::json!({ + "network_policies": { + "test": { + "endpoints": [{ + "host": "api.example.com", + "port": 8080, + "protocol": "rest", + "rules": [{ + "allow": { + "method": "GET", + "path": "/download", + "query": { + "tag": { "any": ["valid-*", "[bad"] } + } + } + }] + }], + "binaries": [] + } + } + }); + let (errors, warnings) = validate_l7_policies(&data); + assert!( + errors.is_empty(), + "malformed glob in any should warn, not error: {errors:?}" + ); + assert!( + warnings + .iter() + .any(|w| w.contains("unclosed '['") && w.contains("allow.query.tag.any")), + "expected glob syntax warning for any item, 
got: {warnings:?}" + ); + } + + #[test] + fn validate_query_string_and_any_matchers_are_accepted() { + let data = serde_json::json!({ + "network_policies": { + "test": { + "endpoints": [{ + "host": "api.example.com", + "port": 8080, + "protocol": "rest", + "rules": [{ + "allow": { + "method": "GET", + "path": "/download", + "query": { + "slug": "my-*", + "tag": { "any": ["foo-*", "bar-*"] }, + "owner": { "glob": "org-*" } + } + } + }] + }], + "binaries": [] + } + } + }); + let (errors, _warnings) = validate_l7_policies(&data); + assert!( + errors.is_empty(), + "valid query matcher shapes should not error: {errors:?}" + ); + } } diff --git a/crates/openshell-sandbox/src/l7/provider.rs b/crates/openshell-sandbox/src/l7/provider.rs index a9bf8bf5f..df0dfb292 100644 --- a/crates/openshell-sandbox/src/l7/provider.rs +++ b/crates/openshell-sandbox/src/l7/provider.rs @@ -10,6 +10,7 @@ //! works for both plaintext TCP and TLS-terminated connections. use miette::Result; +use std::collections::HashMap; use std::future::Future; use tokio::io::{AsyncRead, AsyncWrite}; @@ -31,6 +32,8 @@ pub struct L7Request { pub action: String, /// Target: URL path for REST, empty for SQL. pub target: String, + /// Decoded query parameter multimap for REST requests. + pub query_params: HashMap>, /// Raw request header bytes (request line + headers for HTTP, message for SQL). /// May include overflow body bytes read during header parsing. 
pub raw_header: Vec, diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-sandbox/src/l7/relay.rs index 88f9ce8a6..e0ad2a18c 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-sandbox/src/l7/relay.rs @@ -108,6 +108,7 @@ where let request_info = L7RequestInfo { action: req.action.clone(), target: req.target.clone(), + query_params: req.query_params.clone(), }; // Evaluate L7 policy via Rego @@ -127,6 +128,7 @@ where l7_protocol = "rest", l7_action = %request_info.action, l7_target = %request_info.target, + l7_query_params = ?request_info.query_params, l7_decision = decision_str, l7_deny_reason = %reason, "L7_REQUEST", @@ -198,6 +200,7 @@ pub fn evaluate_l7_request( "request": { "method": request.action, "path": request.target, + "query_params": request.query_params.clone(), } }); diff --git a/crates/openshell-sandbox/src/l7/rest.rs b/crates/openshell-sandbox/src/l7/rest.rs index f47f01bdc..da453ce16 100644 --- a/crates/openshell-sandbox/src/l7/rest.rs +++ b/crates/openshell-sandbox/src/l7/rest.rs @@ -10,6 +10,7 @@ use crate::l7::provider::{BodyLength, L7Provider, L7Request}; use crate::secrets::rewrite_http_header_block; use miette::{IntoDiagnostic, Result, miette}; +use std::collections::HashMap; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tracing::debug; @@ -116,7 +117,7 @@ async fn parse_http_request(client: &mut C) -> Result(client: &mut C) -> Result Result<(String, HashMap>)> { + match target.split_once('?') { + Some((path, query)) => Ok((path.to_string(), parse_query_params(query)?)), + None => Ok((target.to_string(), HashMap::new())), + } +} + +fn parse_query_params(query: &str) -> Result>> { + let mut params: HashMap> = HashMap::new(); + if query.is_empty() { + return Ok(params); + } + + for pair in query.split('&') { + if pair.is_empty() { + continue; + } + + let (raw_key, raw_value) = match pair.split_once('=') { + Some((key, value)) => (key, value), + None => (pair, ""), + }; + let 
key = decode_query_component(raw_key)?; + let value = decode_query_component(raw_value)?; + params.entry(key).or_default().push(value); + } + + Ok(params) +} + +/// Decode a single query string component (key or value). +/// +/// Handles both RFC 3986 percent-encoding (`%20` → space) and the +/// `application/x-www-form-urlencoded` convention (`+` → space). +/// Decoding `+` as space matches the behavior of Python's `urllib.parse`, +/// JavaScript's `URLSearchParams`, Go's `url.ParseQuery`, and most HTTP +/// frameworks. Callers that need a literal `+` should send `%2B`. +fn decode_query_component(input: &str) -> Result { + let bytes = input.as_bytes(); + let mut decoded = Vec::with_capacity(bytes.len()); + let mut i = 0; + + while i < bytes.len() { + if bytes[i] == b'+' { + decoded.push(b' '); + i += 1; + continue; + } + + if bytes[i] != b'%' { + decoded.push(bytes[i]); + i += 1; + continue; + } + + if i + 2 >= bytes.len() { + return Err(miette!("Invalid percent-encoding in query component")); + } + + let hi = decode_hex_nibble(bytes[i + 1]) + .ok_or_else(|| miette!("Invalid percent-encoding in query component"))?; + let lo = decode_hex_nibble(bytes[i + 2]) + .ok_or_else(|| miette!("Invalid percent-encoding in query component"))?; + decoded.push((hi << 4) | lo); + i += 3; + } + + String::from_utf8(decoded).map_err(|_| miette!("Query component is not valid UTF-8")) +} + +fn decode_hex_nibble(byte: u8) -> Option { + match byte { + b'0'..=b'9' => Some(byte - b'0'), + b'a'..=b'f' => Some(byte - b'a' + 10), + b'A'..=b'F' => Some(byte - b'A' + 10), + _ => None, + } +} + /// Forward an allowed HTTP request to upstream and relay the response back. /// /// Returns `true` if the upstream connection is reusable, `false` if consumed. 
@@ -689,6 +771,92 @@ mod tests { } } + #[test] + fn parse_target_query_parses_duplicate_values() { + let (path, query) = parse_target_query("/download?tag=a&tag=b").expect("parse"); + assert_eq!(path, "/download"); + assert_eq!( + query.get("tag").cloned(), + Some(vec!["a".into(), "b".into()]) + ); + } + + #[test] + fn parse_target_query_decodes_percent_and_plus() { + let (path, query) = parse_target_query("/download?slug=my%2Fskill&name=Foo+Bar").unwrap(); + assert_eq!(path, "/download"); + assert_eq!( + query.get("slug").cloned(), + Some(vec!["my/skill".to_string()]) + ); + // `+` is decoded as space per application/x-www-form-urlencoded. + // Literal `+` should be sent as `%2B`. + assert_eq!( + query.get("name").cloned(), + Some(vec!["Foo Bar".to_string()]) + ); + } + + #[test] + fn parse_target_query_literal_plus_via_percent_encoding() { + let (_path, query) = parse_target_query("/search?q=a%2Bb").unwrap(); + assert_eq!( + query.get("q").cloned(), + Some(vec!["a+b".to_string()]), + "%2B should decode to literal +" + ); + } + + #[test] + fn parse_target_query_empty_value() { + let (_path, query) = parse_target_query("/api?tag=").unwrap(); + assert_eq!( + query.get("tag").cloned(), + Some(vec!["".to_string()]), + "key with empty value should produce empty string" + ); + } + + #[test] + fn parse_target_query_key_without_value() { + let (_path, query) = parse_target_query("/api?verbose").unwrap(); + assert_eq!( + query.get("verbose").cloned(), + Some(vec!["".to_string()]), + "key without = should produce empty string value" + ); + } + + #[test] + fn parse_target_query_unicode_after_decoding() { + // "café" = c a f %C3%A9 + let (_path, query) = parse_target_query("/search?q=caf%C3%A9").unwrap(); + assert_eq!( + query.get("q").cloned(), + Some(vec!["café".to_string()]), + "percent-encoded UTF-8 should decode correctly" + ); + } + + #[test] + fn parse_target_query_empty_query_string() { + let (path, query) = parse_target_query("/api?").unwrap(); + assert_eq!(path, 
"/api"); + assert!( + query.is_empty(), + "empty query after ? should produce empty map" + ); + } + + #[test] + fn parse_target_query_rejects_malformed_percent_encoding() { + let err = parse_target_query("/download?slug=bad%2").expect_err("expected parse error"); + assert!( + err.to_string().contains("percent-encoding"), + "unexpected error: {err}" + ); + } + /// SEC-009: Reject requests with both Content-Length and Transfer-Encoding /// to prevent CL/TE request smuggling (RFC 7230 Section 3.3.3). #[test] @@ -807,6 +975,32 @@ mod tests { assert!(result.is_err(), "Must reject unsupported HTTP version"); } + #[tokio::test] + async fn parse_http_request_splits_path_and_query_params() { + let (mut client, mut writer) = tokio::io::duplex(4096); + tokio::spawn(async move { + writer + .write_all( + b"GET /download?slug=my%2Fskill&tag=foo&tag=bar HTTP/1.1\r\nHost: x\r\n\r\n", + ) + .await + .unwrap(); + }); + let req = parse_http_request(&mut client) + .await + .expect("request should parse") + .expect("request should exist"); + assert_eq!(req.target, "/download"); + assert_eq!( + req.query_params.get("slug").cloned(), + Some(vec!["my/skill".to_string()]) + ); + assert_eq!( + req.query_params.get("tag").cloned(), + Some(vec!["foo".to_string(), "bar".to_string()]) + ); + } + /// Regression test: two pipelined requests in a single write must be /// parsed independently. 
Before the fix, the 1024-byte `read()` buffer /// could capture bytes from the second request, which were forwarded @@ -831,6 +1025,7 @@ mod tests { .expect("expected first request"); assert_eq!(first.action, "GET"); assert_eq!(first.target, "/allowed"); + assert!(first.query_params.is_empty()); assert_eq!( first.raw_header, b"GET /allowed HTTP/1.1\r\nHost: example.com\r\n\r\n", "raw_header must contain only the first request's headers" @@ -842,6 +1037,7 @@ mod tests { .expect("expected second request"); assert_eq!(second.action, "POST"); assert_eq!(second.target, "/blocked"); + assert!(second.query_params.is_empty()); } #[test] @@ -1194,7 +1390,7 @@ mod tests { /// to the upstream API, causing 401 Unauthorized errors. #[tokio::test] async fn relay_request_with_resolver_rewrites_credential_placeholders() { - let provider_env: std::collections::HashMap = [( + let provider_env: HashMap = [( "NVIDIA_API_KEY".to_string(), "nvapi-real-secret-key".to_string(), )] @@ -1210,6 +1406,7 @@ mod tests { let req = L7Request { action: "POST".to_string(), target: "/v1/chat/completions".to_string(), + query_params: HashMap::new(), raw_header: format!( "POST /v1/chat/completions HTTP/1.1\r\n\ Host: integrate.api.nvidia.com\r\n\ @@ -1293,6 +1490,7 @@ mod tests { let req = L7Request { action: "POST".to_string(), target: "/v1/chat/completions".to_string(), + query_params: HashMap::new(), raw_header: format!( "POST /v1/chat/completions HTTP/1.1\r\n\ Host: integrate.api.nvidia.com\r\n\ diff --git a/crates/openshell-sandbox/src/mechanistic_mapper.rs b/crates/openshell-sandbox/src/mechanistic_mapper.rs index e5ae64977..4fe90d084 100644 --- a/crates/openshell-sandbox/src/mechanistic_mapper.rs +++ b/crates/openshell-sandbox/src/mechanistic_mapper.rs @@ -337,6 +337,7 @@ fn build_l7_rules(samples: &HashMap<(String, String), u32>) -> Vec { method: method.clone(), path: generalised, command: String::new(), + query: HashMap::new(), }), }); } diff --git a/crates/openshell-sandbox/src/opa.rs 
b/crates/openshell-sandbox/src/opa.rs index cd2931b35..f1df12ff4 100644 --- a/crates/openshell-sandbox/src/opa.rs +++ b/crates/openshell-sandbox/src/opa.rs @@ -667,13 +667,35 @@ fn proto_to_opa_data_json(proto: &ProtoSandboxPolicy) -> String { .iter() .map(|r| { let a = r.allow.as_ref(); - serde_json::json!({ - "allow": { - "method": a.map_or("", |a| &a.method), - "path": a.map_or("", |a| &a.path), - "command": a.map_or("", |a| &a.command), - } - }) + let mut allow = serde_json::json!({ + "method": a.map_or("", |a| &a.method), + "path": a.map_or("", |a| &a.path), + "command": a.map_or("", |a| &a.command), + }); + let query: serde_json::Map = a + .map(|allow| { + allow + .query + .iter() + .map(|(key, matcher)| { + let mut matcher_json = serde_json::json!({}); + if !matcher.glob.is_empty() { + matcher_json["glob"] = + matcher.glob.clone().into(); + } + if !matcher.any.is_empty() { + matcher_json["any"] = + matcher.any.clone().into(); + } + (key.clone(), matcher_json) + }) + .collect() + }) + .unwrap_or_default(); + if !query.is_empty() { + allow["query"] = query.into(); + } + serde_json::json!({ "allow": allow }) }) .collect(); ep["rules"] = rules.into(); @@ -714,8 +736,9 @@ mod tests { use super::*; use openshell_core::proto::{ - FilesystemPolicy as ProtoFs, NetworkBinary, NetworkEndpoint, NetworkPolicyRule, - ProcessPolicy as ProtoProc, SandboxPolicy as ProtoSandboxPolicy, + FilesystemPolicy as ProtoFs, L7Allow, L7QueryMatcher, L7Rule, NetworkBinary, + NetworkEndpoint, NetworkPolicyRule, ProcessPolicy as ProtoProc, + SandboxPolicy as ProtoSandboxPolicy, }; const TEST_POLICY: &str = include_str!("../data/sandbox-policy.rego"); @@ -1337,6 +1360,27 @@ network_policies: access: full binaries: - { path: /usr/bin/curl } + query_api: + name: query_api + endpoints: + - host: api.query.com + port: 8080 + protocol: rest + enforcement: enforce + rules: + - allow: + method: GET + path: "/download" + query: + tag: "foo-*" + - allow: + method: GET + path: "/search" + query: + 
tag: + any: ["foo-*", "bar-*"] + binaries: + - { path: /usr/bin/curl } l4_only: name: l4_only endpoints: @@ -1359,6 +1403,16 @@ process: } fn l7_input(host: &str, port: u16, method: &str, path: &str) -> serde_json::Value { + l7_input_with_query(host, port, method, path, serde_json::json!({})) + } + + fn l7_input_with_query( + host: &str, + port: u16, + method: &str, + path: &str, + query_params: serde_json::Value, + ) -> serde_json::Value { serde_json::json!({ "network": { "host": host, "port": port }, "exec": { @@ -1368,7 +1422,8 @@ process: }, "request": { "method": method, - "path": path + "path": path, + "query_params": query_params } }) } @@ -1472,6 +1527,140 @@ process: assert!(eval_l7(&engine, &input)); } + #[test] + fn l7_query_glob_allows_matching_duplicate_values() { + let engine = l7_engine(); + let input = l7_input_with_query( + "api.query.com", + 8080, + "GET", + "/download", + serde_json::json!({ + "tag": ["foo-a", "foo-b"], + "extra": ["ignored"], + }), + ); + assert!(eval_l7(&engine, &input)); + } + + #[test] + fn l7_query_glob_denies_on_mismatched_duplicate_value() { + let engine = l7_engine(); + let input = l7_input_with_query( + "api.query.com", + 8080, + "GET", + "/download", + serde_json::json!({ + "tag": ["foo-a", "evil"], + }), + ); + assert!(!eval_l7(&engine, &input)); + } + + #[test] + fn l7_query_any_allows_if_every_value_matches_any_pattern() { + let engine = l7_engine(); + let input = l7_input_with_query( + "api.query.com", + 8080, + "GET", + "/search", + serde_json::json!({ + "tag": ["foo-a", "bar-b"], + }), + ); + assert!(eval_l7(&engine, &input)); + } + + #[test] + fn l7_query_missing_required_key_denied() { + let engine = l7_engine(); + let input = l7_input_with_query( + "api.query.com", + 8080, + "GET", + "/download", + serde_json::json!({}), + ); + assert!(!eval_l7(&engine, &input)); + } + + #[test] + fn l7_query_rules_from_proto_are_enforced() { + let mut query = std::collections::HashMap::new(); + query.insert( + 
"tag".to_string(), + L7QueryMatcher { + glob: "foo-*".to_string(), + any: vec![], + }, + ); + + let mut network_policies = std::collections::HashMap::new(); + network_policies.insert( + "query_proto".to_string(), + NetworkPolicyRule { + name: "query_proto".to_string(), + endpoints: vec![NetworkEndpoint { + host: "api.proto.com".to_string(), + port: 8080, + protocol: "rest".to_string(), + enforcement: "enforce".to_string(), + rules: vec![L7Rule { + allow: Some(L7Allow { + method: "GET".to_string(), + path: "/download".to_string(), + command: String::new(), + query, + }), + }], + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }, + ); + + let proto = ProtoSandboxPolicy { + version: 1, + filesystem: Some(ProtoFs { + include_workdir: true, + read_only: vec![], + read_write: vec![], + }), + landlock: Some(openshell_core::proto::LandlockPolicy { + compatibility: "best_effort".to_string(), + }), + process: Some(ProtoProc { + run_as_user: "sandbox".to_string(), + run_as_group: "sandbox".to_string(), + }), + network_policies, + }; + + let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); + let allow_input = l7_input_with_query( + "api.proto.com", + 8080, + "GET", + "/download", + serde_json::json!({ "tag": ["foo-a"] }), + ); + assert!(eval_l7(&engine, &allow_input)); + + let deny_input = l7_input_with_query( + "api.proto.com", + 8080, + "GET", + "/download", + serde_json::json!({ "tag": ["evil"] }), + ); + assert!(!eval_l7(&engine, &deny_input)); + } + #[test] fn l7_no_request_on_l4_only_endpoint() { // L4-only endpoint should not match L7 allow_request diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 088ec46a6..008877257 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -1838,9 +1838,12 @@ async fn handle_forward_proxy( secret_resolver: secret_resolver.clone(), }; + let 
(target_path, query_params) = crate::l7::rest::parse_target_query(&path) + .unwrap_or_else(|_| (path.clone(), std::collections::HashMap::new())); let request_info = crate::l7::L7RequestInfo { action: method.to_string(), - target: path.clone(), + target: target_path, + query_params, }; let (allowed, reason) = diff --git a/docs/reference/policy-schema.md b/docs/reference/policy-schema.md index 6916e8d0c..7ad317f36 100644 --- a/docs/reference/policy-schema.md +++ b/docs/reference/policy-schema.md @@ -196,6 +196,7 @@ Used when `access` is not set. Each rule explicitly allows a method and path com |---|---|---|---| | `allow.method` | string | Yes | HTTP method to allow (for example, `GET`, `POST`). | | `allow.path` | string | Yes | URL path pattern. Supports `*` and `**` glob syntax. | +| `allow.query` | map | No | Query parameter matchers keyed by decoded param name. Matcher value can be a glob string (`tag: "foo-*"`) or an object with `any` (`tag: { any: ["foo-*", "bar-*"] }`). | Example with rules: @@ -204,9 +205,14 @@ rules: - allow: method: GET path: /**/info/refs* + query: + service: "git-*" - allow: method: POST path: /**/git-upload-pack + query: + tag: + any: ["v1.*", "v2.*"] ``` ### Binary Object diff --git a/docs/sandboxes/policies.md b/docs/sandboxes/policies.md index fa5ed5d8b..3ec33af9e 100644 --- a/docs/sandboxes/policies.md +++ b/docs/sandboxes/policies.md @@ -269,6 +269,32 @@ Endpoints with `protocol: rest` enable HTTP request inspection. The proxy auto-d ::::: +### Query parameter matching + +REST rules can also constrain query parameter values: + +```yaml + download_api: + name: download_api + endpoints: + - host: api.example.com + port: 443 + protocol: rest + enforcement: enforce + rules: + - allow: + method: GET + path: "/api/v1/download" + query: + slug: "skill-*" + version: + any: ["1.*", "2.*"] + binaries: + - { path: /usr/bin/curl } +``` + +`query` matchers are case-sensitive and run on decoded values. 
If a request has duplicate keys (for example, `tag=a&tag=b`), every value for that key must match the configured glob(s). + ## Next Steps Explore related topics: diff --git a/e2e/python/test_sandbox_policy.py b/e2e/python/test_sandbox_policy.py index 1615bef6c..6a4bf5ed2 100644 --- a/e2e/python/test_sandbox_policy.py +++ b/e2e/python/test_sandbox_policy.py @@ -271,6 +271,90 @@ def fn(proxy_host, proxy_port, target_url): return fn +def _proxy_connect_then_http_with_server(): + """Return a closure that starts a local HTTP server and sends CONNECT+HTTP.""" + + def fn(proxy_host, proxy_port, target_host, target_port, method="GET", path="/"): + import json as _json + import socket + import threading + import time + from http.server import BaseHTTPRequestHandler, HTTPServer + + class Handler(BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(200) + body = b"connect-server-ok" + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def do_POST(self): + self.send_response(200) + body = b"connect-server-ok" + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, *args): + pass + + srv = HTTPServer(("0.0.0.0", int(target_port)), Handler) + threading.Thread(target=srv.handle_request, daemon=True).start() + time.sleep(0.5) + + conn = socket.create_connection((proxy_host, int(proxy_port)), timeout=10) + try: + conn.sendall( + f"CONNECT {target_host}:{target_port} HTTP/1.1\r\nHost: {target_host}\r\n\r\n".encode() + ) + connect_resp = conn.recv(256).decode("latin1") + if "200" not in connect_resp: + return _json.dumps( + {"connect_status": connect_resp.strip(), "http_status": 0} + ) + + request = ( + f"{method} {path} HTTP/1.1\r\nHost: {target_host}\r\nConnection: close\r\n\r\n" + ) + conn.sendall(request.encode()) + + data = b"" + conn.settimeout(5) + try: + while True: + chunk = conn.recv(4096) + if not chunk: + break + data += chunk + except 
socket.timeout: + pass + + response = data.decode("latin1", errors="replace") + status_line = response.split("\r\n")[0] if response else "" + status_code = ( + int(status_line.split()[1]) if len(status_line.split()) >= 2 else 0 + ) + + header_end = response.find("\r\n\r\n") + headers_raw = response[:header_end] if header_end > 0 else "" + body = response[header_end + 4 :] if header_end > 0 else "" + + return _json.dumps( + { + "connect_status": connect_resp.strip(), + "http_status": status_code, + "headers": headers_raw, + "body": body, + } + ) + finally: + conn.close() + srv.server_close() + + return fn + + def test_policy_applies_to_exec_commands( sandbox: Callable[..., Sandbox], ) -> None: @@ -796,6 +880,8 @@ def test_ssrf_loopback_blocked_even_with_allowed_ips( # L7-T6: L7 deny response is valid JSON with expected fields # L7-T7: L7 request logging includes structured fields # L7-T8: Port 443 + protocol=rest without tls=terminate warns (L7 not evaluated) +# L7-T9: Query matcher glob/any allows and denies as expected +# L7-T10: Rule without query matcher allows any query params # ============================================================================= @@ -1102,6 +1188,166 @@ def test_l7_tls_log_fields( assert "l7_decision" in log +def test_l7_query_matchers_enforced( + sandbox: Callable[..., Sandbox], +) -> None: + """L7-T9: Query matcher glob/any allows and denies as expected.""" + policy = _base_policy( + network_policies={ + "query_api": sandbox_pb2.NetworkPolicyRule( + name="query_api", + endpoints=[ + sandbox_pb2.NetworkEndpoint( + host=_SANDBOX_IP, + port=_FORWARD_PROXY_PORT, + protocol="rest", + enforcement="enforce", + allowed_ips=["10.200.0.0/24"], + rules=[ + sandbox_pb2.L7Rule( + allow=sandbox_pb2.L7Allow( + method="GET", + path="/download", + query={ + "tag": sandbox_pb2.L7QueryMatcher(glob="foo-*"), + }, + ), + ), + sandbox_pb2.L7Rule( + allow=sandbox_pb2.L7Allow( + method="GET", + path="/search", + query={ + "tag": sandbox_pb2.L7QueryMatcher( 
+ any=["foo-*", "bar-*"] + ), + }, + ), + ), + ], + ), + ], + binaries=[sandbox_pb2.NetworkBinary(path="/**")], + ), + }, + ) + spec = datamodel_pb2.SandboxSpec(policy=policy) + with sandbox(spec=spec, delete_on_exit=True) as sb: + allowed = sb.exec_python( + _proxy_connect_then_http_with_server(), + args=( + _PROXY_HOST, + _PROXY_PORT, + _SANDBOX_IP, + _FORWARD_PROXY_PORT, + "GET", + "/download?tag=foo-a&tag=foo-b", + ), + ) + assert allowed.exit_code == 0, allowed.stderr + allowed_resp = json.loads(allowed.stdout) + assert "200" in allowed_resp["connect_status"] + assert allowed_resp["http_status"] == 200 + assert "connect-server-ok" in allowed_resp["body"] + + denied = sb.exec_python( + _proxy_connect_then_http_with_server(), + args=( + _PROXY_HOST, + _PROXY_PORT, + _SANDBOX_IP, + _FORWARD_PROXY_PORT, + "GET", + "/download?tag=foo-a&tag=evil", + ), + ) + assert denied.exit_code == 0, denied.stderr + denied_resp = json.loads(denied.stdout) + assert denied_resp["http_status"] == 403 + assert "policy_denied" in denied_resp["body"] + + any_allowed = sb.exec_python( + _proxy_connect_then_http_with_server(), + args=( + _PROXY_HOST, + _PROXY_PORT, + _SANDBOX_IP, + _FORWARD_PROXY_PORT, + "GET", + "/search?tag=foo-a&tag=bar-b", + ), + ) + assert any_allowed.exit_code == 0, any_allowed.stderr + any_resp = json.loads(any_allowed.stdout) + assert any_resp["http_status"] == 200 + assert "connect-server-ok" in any_resp["body"] + + missing_required = sb.exec_python( + _proxy_connect_then_http_with_server(), + args=( + _PROXY_HOST, + _PROXY_PORT, + _SANDBOX_IP, + _FORWARD_PROXY_PORT, + "GET", + "/download?slug=skill-1", + ), + ) + assert missing_required.exit_code == 0, missing_required.stderr + missing_resp = json.loads(missing_required.stdout) + assert missing_resp["http_status"] == 403 + assert "policy_denied" in missing_resp["body"] + + +def test_l7_rule_without_query_matcher_allows_any_query_params( + sandbox: Callable[..., Sandbox], +) -> None: + """L7-T10: Rule without 
query matcher allows any query params.""" + policy = _base_policy( + network_policies={ + "query_optional": sandbox_pb2.NetworkPolicyRule( + name="query_optional", + endpoints=[ + sandbox_pb2.NetworkEndpoint( + host=_SANDBOX_IP, + port=_FORWARD_PROXY_PORT, + protocol="rest", + enforcement="enforce", + allowed_ips=["10.200.0.0/24"], + rules=[ + sandbox_pb2.L7Rule( + allow=sandbox_pb2.L7Allow( + method="GET", + path="/download", + ), + ), + ], + ), + ], + binaries=[sandbox_pb2.NetworkBinary(path="/**")], + ), + }, + ) + spec = datamodel_pb2.SandboxSpec(policy=policy) + with sandbox(spec=spec, delete_on_exit=True) as sb: + result = sb.exec_python( + _proxy_connect_then_http_with_server(), + args=( + _PROXY_HOST, + _PROXY_PORT, + _SANDBOX_IP, + _FORWARD_PROXY_PORT, + "GET", + "/download?tag=anything&slug=any-value", + ), + ) + assert result.exit_code == 0, result.stderr + resp = json.loads(result.stdout) + assert "200" in resp["connect_status"] + assert resp["http_status"] == 200 + assert "connect-server-ok" in resp["body"] + + # ============================================================================= # Live policy update + log streaming tests # diff --git a/proto/sandbox.proto b/proto/sandbox.proto index a96ca33fd..61948a527 100644 --- a/proto/sandbox.proto +++ b/proto/sandbox.proto @@ -100,6 +100,18 @@ message L7Allow { string path = 2; // SQL command (SQL): SELECT, INSERT, etc. or "*" for any. string command = 3; + // Query parameter matcher map (REST). + // Key is the decoded query parameter name (case-sensitive). + // Value supports either a single glob (`glob`) or a list (`any`). + map<string, L7QueryMatcher> query = 4; +} + +// Query value matcher for one query parameter key. +message L7QueryMatcher { + // Single glob pattern. + string glob = 1; + // Any-of glob patterns. + repeated string any = 2; } // A binary identity for network policy matching. 
From 0815f82958002bfdef89e0c0cda1cf95ac847f80 Mon Sep 17 00:00:00 2001 From: Rafael Marcelino Koike Date: Mon, 30 Mar 2026 17:29:00 -0400 Subject: [PATCH 15/45] perf(sandbox): streaming SHA256 and spawn_blocking for identity resolution (#555) * perf(sandbox): streaming SHA256, spawn_blocking for identity resolution Key changes: - Replace full file read + SHA256 with streaming 64KB-buffered hash (saves 124MB allocation for node binary) - Wrap evaluate_opa_tcp in spawn_blocking to prevent blocking tokio runtime during heavy /proc I/O and SHA256 computation - Add file-based perf logging for profiling proxy latency phases Profiling data (node binary, 124MB): - Cold TOFU: ~890ms (read+hash), warm: 0ms (cache hit) - evaluate_opa_tcp: cold=1002ms, warm=11ms - OPA evaluation: 1ms - DNS+TCP connect: 166-437ms Made-with: Cursor Signed-off-by: Rafael Koike * refactor(sandbox): replace perf_log with tracing::debug Replace the custom file-based perf_log() helper with standard tracing::debug!() macros as requested in PR review. This removes the custom log file writes to /var/log/openshell-perf.log and routes all performance timing through the tracing framework at DEBUG level, consistent with the rest of the codebase. Made-with: Cursor * refactor(sandbox): reduce tracing to 6 key diagnostic logs Address PR review feedback: 1. Remove ~20 inner-phase timing logs, keeping only the 6 that tell the full diagnostic story: - evaluate_opa_tcp TOTAL (proxy.rs) - dns_resolve_and_tcp_connect (proxy.rs) - file_sha256 (procfs.rs) - verify_or_cache CACHE HIT / CACHE MISS / TOTAL cold (identity.rs) 2. Restore intent-describing comments that were replaced by timing logs: - "TOFU verify the immediate binary" (proxy.rs) - "Walk the process tree upward to collect ancestor binaries" (proxy.rs) - "Collect cmdline paths for script-based binary detection." 
(proxy.rs) - "First: scan descendants of the entrypoint process" (procfs.rs) - "Fallback: scan all of /proc in case the process isn't in the tree" (procfs.rs) - "Skip PIDs we already checked" (procfs.rs) 3. Preserve file path in file_sha256 read errors instead of losing context via into_diagnostic(). 4. Tests: 293 passed, 1 pre-existing failure (drop_privileges), 1 ignored. Made-with: Cursor * style(sandbox): apply rustfmt formatting to debug macros --------- Signed-off-by: Rafael Koike Co-authored-by: John Myers --- crates/openshell-sandbox/src/identity.rs | 19 +++++++ crates/openshell-sandbox/src/procfs.rs | 34 ++++++++++-- crates/openshell-sandbox/src/proxy.rs | 71 +++++++++++++++++------- 3 files changed, 98 insertions(+), 26 deletions(-) diff --git a/crates/openshell-sandbox/src/identity.rs b/crates/openshell-sandbox/src/identity.rs index d27976ba7..49809f95b 100644 --- a/crates/openshell-sandbox/src/identity.rs +++ b/crates/openshell-sandbox/src/identity.rs @@ -16,6 +16,7 @@ use std::fs::Metadata; use std::os::unix::fs::MetadataExt; use std::path::{Path, PathBuf}; use std::sync::Mutex; +use tracing::debug; #[derive(Clone)] struct FileFingerprint { @@ -100,6 +101,7 @@ impl BinaryIdentityCache { where F: FnMut(&Path) -> Result, { + let start = std::time::Instant::now(); let metadata = std::fs::metadata(path) .map_err(|error| miette::miette!("Failed to stat {}: {error}", path.display()))?; let fingerprint = FileFingerprint::from_metadata(&metadata); @@ -114,9 +116,20 @@ impl BinaryIdentityCache { if let Some(cached_binary) = &cached && cached_binary.fingerprint == fingerprint { + debug!( + " verify_or_cache: {}ms CACHE HIT path={}", + start.elapsed().as_millis(), + path.display() + ); return Ok(cached_binary.hash.clone()); } + debug!( + " verify_or_cache: CACHE MISS size={} path={}", + metadata.len(), + path.display() + ); + let current_hash = hash_file(path)?; let mut hashes = self @@ -143,6 +156,12 @@ impl BinaryIdentityCache { }, ); + debug!( + " 
verify_or_cache TOTAL (cold): {}ms path={}", + start.elapsed().as_millis(), + path.display() + ); + Ok(current_hash) } } diff --git a/crates/openshell-sandbox/src/procfs.rs b/crates/openshell-sandbox/src/procfs.rs index ece16c82a..785a9489e 100644 --- a/crates/openshell-sandbox/src/procfs.rs +++ b/crates/openshell-sandbox/src/procfs.rs @@ -6,10 +6,11 @@ //! Provides functions to resolve binary paths and compute file hashes //! for process-identity binding in the OPA proxy policy engine. -use miette::{IntoDiagnostic, Result}; +use miette::Result; use std::path::Path; #[cfg(target_os = "linux")] use std::path::PathBuf; +use tracing::debug; /// Read the binary path of a process via `/proc/{pid}/exe` symlink. /// @@ -229,8 +230,9 @@ fn parse_proc_net_tcp(pid: u32, peer_port: u16) -> Result { fn find_pid_by_socket_inode(inode: u64, entrypoint_pid: u32) -> Result { let target = format!("socket:[{inode}]"); - // First: scan descendants of the entrypoint process (targeted, most likely to succeed) + // First: scan descendants of the entrypoint process let descendants = collect_descendant_pids(entrypoint_pid); + for &pid in &descendants { if let Some(found) = check_pid_fds(pid, &target) { return Ok(found); @@ -238,7 +240,6 @@ fn find_pid_by_socket_inode(inode: u64, entrypoint_pid: u32) -> Result { } // Fallback: scan all of /proc in case the process isn't in the tree - // (e.g., if /proc//task//children wasn't available) if let Ok(proc_dir) = std::fs::read_dir("/proc") { for entry in proc_dir.flatten() { let name = entry.file_name(); @@ -318,9 +319,32 @@ fn collect_descendant_pids(root_pid: u32) -> Vec { /// same hash, or the request is denied. 
pub fn file_sha256(path: &Path) -> Result<String> { use sha2::{Digest, Sha256}; + use std::io::Read; + + let start = std::time::Instant::now(); + let mut file = std::fs::File::open(path) + .map_err(|e| miette::miette!("Failed to open {}: {e}", path.display()))?; + let mut hasher = Sha256::new(); + let mut buf = [0u8; 65536]; + let mut total_read = 0u64; + loop { + let n = file + .read(&mut buf) + .map_err(|e| miette::miette!("Failed to read {}: {e}", path.display()))?; + if n == 0 { + break; + } + total_read += n as u64; + hasher.update(&buf[..n]); + } - let bytes = std::fs::read(path).into_diagnostic()?; - let hash = Sha256::digest(&bytes); + let hash = hasher.finalize(); + debug!( + " file_sha256: {}ms size={} path={}", + start.elapsed().as_millis(), + total_read, + path.display() + ); Ok(hex::encode(hash)) } diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 008877257..49fe5e07b 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -336,15 +336,25 @@ async fn handle_tcp_connection( let peer_addr = client.peer_addr().into_diagnostic()?; let local_addr = client.local_addr().into_diagnostic()?; - // Evaluate OPA policy with process-identity binding - let decision = evaluate_opa_tcp( - peer_addr, - &opa_engine, - &identity_cache, - &entrypoint_pid, - &host_lc, - port, - ); + // Evaluate OPA policy with process-identity binding. + // Wrapped in spawn_blocking because identity resolution does heavy sync I/O: + // /proc scanning + SHA256 hashing of binaries (e.g. node at 124MB). 
+ let opa_clone = opa_engine.clone(); + let cache_clone = identity_cache.clone(); + let pid_clone = entrypoint_pid.clone(); + let host_clone = host_lc.clone(); + let decision = tokio::task::spawn_blocking(move || { + evaluate_opa_tcp( + peer_addr, + &opa_clone, + &cache_clone, + &pid_clone, + &host_clone, + port, + ) + }) + .await + .map_err(|e| miette::miette!("identity resolution task panicked: {e}"))?; // Extract action string and matched policy for logging let (matched_policy, deny_reason) = match &decision.action { @@ -426,6 +436,7 @@ async fn handle_tcp_connection( } // Defense-in-depth: resolve DNS and reject connections to internal IPs. + let dns_connect_start = std::time::Instant::now(); let mut upstream = if !raw_allowed_ips.is_empty() { // allowed_ips mode: validate resolved IPs against CIDR allowlist. // Loopback and link-local are still always blocked. @@ -502,6 +513,11 @@ async fn handle_tcp_connection( } }; + debug!( + "handle_tcp_connection dns_resolve_and_tcp_connect: {}ms host={host_lc}", + dns_connect_start.elapsed().as_millis() + ); + respond(&mut client, b"HTTP/1.1 200 Connection Established\r\n\r\n").await?; // Check if endpoint has L7 config for protocol-aware inspection @@ -736,7 +752,9 @@ fn evaluate_opa_tcp( ); } + let total_start = std::time::Instant::now(); let peer_port = peer_addr.port(); + let (bin_path, binary_pid) = match crate::procfs::resolve_tcp_peer_identity(pid, peer_port) { Ok(r) => r, Err(e) => { @@ -767,7 +785,6 @@ fn evaluate_opa_tcp( // Walk the process tree upward to collect ancestor binaries let ancestors = crate::procfs::collect_ancestor_binaries(binary_pid, pid); - // TOFU verify each ancestor binary for ancestor in &ancestors { if let Err(e) = identity_cache.verify_or_cache(ancestor) { return deny( @@ -784,7 +801,6 @@ fn evaluate_opa_tcp( } // Collect cmdline paths for script-based binary detection. - // Excludes exe paths already captured in bin_path/ancestors to avoid duplicates. 
let mut exclude = ancestors.clone(); exclude.push(bin_path.clone()); let cmdline_paths = crate::procfs::collect_cmdline_paths(binary_pid, pid, &exclude); @@ -798,7 +814,7 @@ fn evaluate_opa_tcp( cmdline_paths: cmdline_paths.clone(), }; - match engine.evaluate_network_action(&input) { + let result = match engine.evaluate_network_action(&input) { Ok(action) => ConnectDecision { action, binary: Some(bin_path), @@ -813,7 +829,12 @@ fn evaluate_opa_tcp( ancestors, cmdline_paths, ), - } + }; + debug!( + "evaluate_opa_tcp TOTAL: {}ms host={host} port={port}", + total_start.elapsed().as_millis() + ); + result } /// Non-Linux stub: OPA identity binding requires /proc. @@ -1728,14 +1749,22 @@ async fn handle_forward_proxy( let peer_addr = client.peer_addr().into_diagnostic()?; let local_addr = client.local_addr().into_diagnostic()?; - let decision = evaluate_opa_tcp( - peer_addr, - &opa_engine, - &identity_cache, - &entrypoint_pid, - &host_lc, - port, - ); + let opa_clone = opa_engine.clone(); + let cache_clone = identity_cache.clone(); + let pid_clone = entrypoint_pid.clone(); + let host_clone = host_lc.clone(); + let decision = tokio::task::spawn_blocking(move || { + evaluate_opa_tcp( + peer_addr, + &opa_clone, + &cache_clone, + &pid_clone, + &host_clone, + port, + ) + }) + .await + .map_err(|e| miette::miette!("identity resolution task panicked: {e}"))?; // Build log context let binary_str = decision From 36329a1059f5fe0c182c1656c4f73b9b190a2ec6 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 31 Mar 2026 00:02:30 +0200 Subject: [PATCH 16/45] feat(inference): allow setting custom inference timeout (#672) * feat(inference): add timeout * feat(inference): fix dynamic timeout change * feat(inference): update docs * feat(inference): fix formatting --- architecture/inference-routing.md | 23 ++++++++++------- crates/openshell-cli/src/main.rs | 13 +++++++++- crates/openshell-cli/src/run.rs | 21 ++++++++++++++-- crates/openshell-router/src/backend.rs | 3 ++- 
crates/openshell-router/src/config.rs | 8 ++++++ crates/openshell-router/src/lib.rs | 3 --- crates/openshell-router/src/mock.rs | 1 + .../tests/backend_integration.rs | 6 +++++ crates/openshell-sandbox/src/lib.rs | 22 ++++++++++++++++ .../tests/system_inference.rs | 3 +++ crates/openshell-server/src/inference.rs | 25 +++++++++++++++++-- docs/inference/configure.md | 23 +++++++++++++++-- proto/inference.proto | 10 ++++++++ 13 files changed, 141 insertions(+), 20 deletions(-) diff --git a/architecture/inference-routing.md b/architecture/inference-routing.md index 0d3a95afb..9d45d7cd9 100644 --- a/architecture/inference-routing.md +++ b/architecture/inference-routing.md @@ -92,10 +92,10 @@ File: `proto/inference.proto` Key messages: -- `SetClusterInferenceRequest` -- `provider_name` + `model_id` + optional `no_verify` override, with verification enabled by default -- `SetClusterInferenceResponse` -- `provider_name` + `model_id` + `version` +- `SetClusterInferenceRequest` -- `provider_name` + `model_id` + `timeout_secs` + optional `no_verify` override, with verification enabled by default +- `SetClusterInferenceResponse` -- `provider_name` + `model_id` + `timeout_secs` + `version` - `GetInferenceBundleResponse` -- `repeated ResolvedRoute routes` + `revision` + `generated_at_ms` -- `ResolvedRoute` -- `name`, `base_url`, `protocols`, `api_key`, `model_id`, `provider_type` +- `ResolvedRoute` -- `name`, `base_url`, `protocols`, `api_key`, `model_id`, `provider_type`, `timeout_secs` ## Data Plane (Sandbox) @@ -106,7 +106,7 @@ Files: - `crates/openshell-sandbox/src/lib.rs` -- inference context initialization, route refresh - `crates/openshell-sandbox/src/grpc_client.rs` -- `fetch_inference_bundle()` -In cluster mode, the sandbox starts a background refresh loop as soon as the inference context is created. The loop polls the gateway every 5 seconds by default (`OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS` override) and uses the bundle revision hash to skip no-op cache writes. 
+In cluster mode, the sandbox starts a background refresh loop as soon as the inference context is created. The loop polls the gateway every 5 seconds by default (`OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS` override) and uses the bundle revision hash to skip no-op cache writes. The revision hash covers all route fields including `timeout_secs`, so any configuration change (provider, model, or timeout) triggers a cache update on the next poll. ### Interception flow @@ -143,7 +143,7 @@ If no pattern matches, the proxy returns `403 Forbidden` with `{"error": "connec ### Route cache - `InferenceContext` holds a `Router`, the pattern list, and an `Arc<RwLock<Vec<ResolvedRoute>>>` route cache. -- In cluster mode, `spawn_route_refresh()` polls `GetInferenceBundle` every 30 seconds (`ROUTE_REFRESH_INTERVAL_SECS`). On failure, stale routes are kept. +- In cluster mode, `spawn_route_refresh()` polls `GetInferenceBundle` every 5 seconds (`OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS`). On failure, stale routes are kept. - In file mode (`--inference-routes`), routes load once at startup from YAML. No refresh task is spawned. - In cluster mode, an empty initial bundle still enables the inference context so the refresh task can pick up later configuration. @@ -209,9 +209,11 @@ File: `crates/openshell-router/src/mock.rs` Routes with `mock://` scheme endpoints return canned responses without making HTTP requests. Mock responses are protocol-aware (OpenAI chat completion, OpenAI completion, Anthropic messages, or generic JSON). Mock routes include an `x-openshell-mock: true` response header. -### HTTP client +### Per-request timeout -The router uses a `reqwest::Client` with a 60-second timeout. Timeouts and connection failures map to `RouterError::UpstreamUnavailable`. +Each `ResolvedRoute` carries a `timeout` field (`Duration`). The `reqwest::Client` has no global timeout; instead, each outgoing request applies `.timeout(route.timeout)` on the request builder. 
When `timeout_secs` is `0` in the proto message, the default of 60 seconds is used (defined as `DEFAULT_ROUTE_TIMEOUT` in `config.rs`). Timeouts and connection failures map to `RouterError::UpstreamUnavailable`. + +Timeout changes propagate dynamically to running sandboxes. The bundle revision hash includes `timeout_secs`, so when the timeout is updated via `openshell inference update --timeout`, the refresh loop detects the revision change and updates the route cache within one polling interval (5 seconds by default). ## Standalone Route File @@ -297,13 +299,16 @@ The system route is stored as a separate `InferenceRoute` record in the gateway Cluster inference commands: -- `openshell inference set --provider --model ` -- configures user-facing cluster inference -- `openshell inference set --system --provider --model ` -- configures system inference +- `openshell inference set --provider --model [--timeout ]` -- configures user-facing cluster inference +- `openshell inference set --system --provider --model [--timeout ]` -- configures system inference +- `openshell inference update [--provider ] [--model ] [--timeout ]` -- updates individual fields without resetting others - `openshell inference get` -- displays both user and system inference configuration - `openshell inference get --system` -- displays only the system inference configuration The `--provider` flag references a provider record name (not a provider type). The provider must already exist in the cluster and have a supported inference type (`openai`, `anthropic`, or `nvidia`). +The `--timeout` flag sets the per-request timeout in seconds for upstream inference calls. When omitted or set to `0`, the default of 60 seconds applies. Timeout changes propagate to running sandboxes within the route refresh interval (5 seconds by default). + Inference writes verify by default. `--no-verify` is the explicit opt-out for endpoints that are not up yet. 
## Provider Discovery diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 5de31c79c..df37410b6 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -937,6 +937,10 @@ enum InferenceCommands { /// Skip endpoint verification before saving the route. #[arg(long)] no_verify: bool, + + /// Request timeout in seconds for inference calls (0 = default 60s). + #[arg(long, default_value_t = 0)] + timeout: u64, }, /// Update gateway-level inference configuration (partial update). @@ -957,6 +961,10 @@ enum InferenceCommands { /// Skip endpoint verification before saving the route. #[arg(long)] no_verify: bool, + + /// Request timeout in seconds for inference calls (0 = default 60s, unchanged if omitted). + #[arg(long)] + timeout: Option<u64>, }, /// Get gateway-level inference provider and model. @@ -2026,10 +2034,11 @@ async fn main() -> Result<()> { model, system, no_verify, + timeout, } => { let route_name = if system { "sandbox-system" } else { "" }; run::gateway_inference_set( - endpoint, &provider, &model, route_name, no_verify, &tls, + endpoint, &provider, &model, route_name, no_verify, timeout, &tls, ) .await?; } @@ -2038,6 +2047,7 @@ async fn main() -> Result<()> { model, system, no_verify, + timeout, } => { let route_name = if system { "sandbox-system" } else { "" }; run::gateway_inference_update( @@ -2046,6 +2056,7 @@ async fn main() -> Result<()> { model.as_deref(), route_name, no_verify, + timeout, &tls, ) .await?; diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index e32eec2a4..bab819137 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -3481,6 +3481,7 @@ pub async fn gateway_inference_set( model_id: &str, route_name: &str, no_verify: bool, + timeout_secs: u64, tls: &TlsOptions, ) -> Result<()> { let progress = if std::io::stdout().is_terminal() { @@ -3504,6 +3505,7 @@ pub async fn gateway_inference_set( route_name: 
route_name.to_string(), verify: false, no_verify, + timeout_secs, }) .await; @@ -3525,6 +3527,7 @@ pub async fn gateway_inference_set( println!(" {} {}", "Provider:".dimmed(), configured.provider_name); println!(" {} {}", "Model:".dimmed(), configured.model_id); println!(" {} {}", "Version:".dimmed(), configured.version); + print_timeout(configured.timeout_secs); if configured.validation_performed { println!(" {}", "Validated Endpoints:".dimmed()); for endpoint in configured.validated_endpoints { @@ -3540,11 +3543,12 @@ pub async fn gateway_inference_update( model_id: Option<&str>, route_name: &str, no_verify: bool, + timeout_secs: Option<u64>, tls: &TlsOptions, ) -> Result<()> { - if provider_name.is_none() && model_id.is_none() { + if provider_name.is_none() && model_id.is_none() && timeout_secs.is_none() { return Err(miette::miette!( - "at least one of --provider or --model must be specified" + "at least one of --provider, --model, or --timeout must be specified" )); } @@ -3561,6 +3565,7 @@ pub async fn gateway_inference_update( let provider = provider_name.unwrap_or(&current.provider_name); let model = model_id.unwrap_or(&current.model_id); + let timeout = timeout_secs.unwrap_or(current.timeout_secs); let progress = if std::io::stdout().is_terminal() { let spinner = ProgressBar::new_spinner(); @@ -3582,6 +3587,7 @@ pub async fn gateway_inference_update( route_name: route_name.to_string(), verify: false, no_verify, + timeout_secs: timeout, }) .await; @@ -3603,6 +3609,7 @@ pub async fn gateway_inference_update( println!(" {} {}", "Provider:".dimmed(), configured.provider_name); println!(" {} {}", "Model:".dimmed(), configured.model_id); println!(" {} {}", "Version:".dimmed(), configured.version); + print_timeout(configured.timeout_secs); if configured.validation_performed { println!(" {}", "Validated Endpoints:".dimmed()); for endpoint in configured.validated_endpoints { @@ -3639,6 +3646,7 @@ pub async fn gateway_inference_get( println!(" {} {}", "Provider:".dimmed(), 
configured.provider_name); println!(" {} {}", "Model:".dimmed(), configured.model_id); println!(" {} {}", "Version:".dimmed(), configured.version); + print_timeout(configured.timeout_secs); } else { // Show both routes by default. print_inference_route(&mut client, "Gateway inference", "").await; @@ -3666,6 +3674,7 @@ async fn print_inference_route( println!(" {} {}", "Provider:".dimmed(), configured.provider_name); println!(" {} {}", "Model:".dimmed(), configured.model_id); println!(" {} {}", "Version:".dimmed(), configured.version); + print_timeout(configured.timeout_secs); } Err(e) if e.code() == Code::NotFound => { println!("{}", format!("{label}:").cyan().bold()); @@ -3680,6 +3689,14 @@ async fn print_inference_route( } } +fn print_timeout(timeout_secs: u64) { + if timeout_secs == 0 { + println!(" {} {}s (default)", "Timeout:".dimmed(), 60); + } else { + println!(" {} {}s", "Timeout:".dimmed(), timeout_secs); + } +} + fn format_inference_status(status: Status) -> miette::Report { let message = status.message().trim(); diff --git a/crates/openshell-router/src/backend.rs b/crates/openshell-router/src/backend.rs index d82ea082c..d1d7092c0 100644 --- a/crates/openshell-router/src/backend.rs +++ b/crates/openshell-router/src/backend.rs @@ -149,7 +149,7 @@ async fn send_backend_request( } Err(_) => body, }; - builder = builder.body(body); + builder = builder.body(body).timeout(route.timeout); builder.send().await.map_err(|e| { if e.is_timeout() { @@ -468,6 +468,7 @@ mod tests { protocols: protocols.iter().map(|p| (*p).to_string()).collect(), auth, default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], + timeout: crate::config::DEFAULT_ROUTE_TIMEOUT, } } diff --git a/crates/openshell-router/src/config.rs b/crates/openshell-router/src/config.rs index d9c081d60..52c22da9f 100644 --- a/crates/openshell-router/src/config.rs +++ b/crates/openshell-router/src/config.rs @@ -3,11 +3,14 @@ use serde::Deserialize; use std::path::Path; +use 
std::time::Duration; pub use openshell_core::inference::AuthHeader; use crate::RouterError; +pub const DEFAULT_ROUTE_TIMEOUT: Duration = Duration::from_secs(60); + #[derive(Debug, Clone, Deserialize)] pub struct RouterConfig { pub routes: Vec, @@ -45,6 +48,8 @@ pub struct ResolvedRoute { pub auth: AuthHeader, /// Extra headers injected on every request (e.g. `anthropic-version`). pub default_headers: Vec<(String, String)>, + /// Per-request timeout for proxied inference calls. + pub timeout: Duration, } impl std::fmt::Debug for ResolvedRoute { @@ -57,6 +62,7 @@ impl std::fmt::Debug for ResolvedRoute { .field("protocols", &self.protocols) .field("auth", &self.auth) .field("default_headers", &self.default_headers) + .field("timeout", &self.timeout) .finish() } } @@ -129,6 +135,7 @@ impl RouteConfig { protocols, auth, default_headers, + timeout: DEFAULT_ROUTE_TIMEOUT, }) } } @@ -256,6 +263,7 @@ routes: protocols: vec!["openai_chat_completions".to_string()], auth: AuthHeader::Bearer, default_headers: Vec::new(), + timeout: DEFAULT_ROUTE_TIMEOUT, }; let debug_output = format!("{route:?}"); assert!( diff --git a/crates/openshell-router/src/lib.rs b/crates/openshell-router/src/lib.rs index a5712d9a0..7deed6fc4 100644 --- a/crates/openshell-router/src/lib.rs +++ b/crates/openshell-router/src/lib.rs @@ -5,8 +5,6 @@ mod backend; pub mod config; mod mock; -use std::time::Duration; - pub use backend::{ ProxyResponse, StreamingProxyResponse, ValidatedEndpoint, ValidationFailure, ValidationFailureKind, verify_backend_endpoint, @@ -39,7 +37,6 @@ pub struct Router { impl Router { pub fn new() -> Result { let client = reqwest::Client::builder() - .timeout(Duration::from_secs(60)) .build() .map_err(|e| RouterError::Internal(format!("failed to build HTTP client: {e}")))?; Ok(Self { diff --git a/crates/openshell-router/src/mock.rs b/crates/openshell-router/src/mock.rs index 9b6accb60..a17ce486f 100644 --- a/crates/openshell-router/src/mock.rs +++ b/crates/openshell-router/src/mock.rs 
@@ -131,6 +131,7 @@ mod tests { protocols: protocols.iter().map(ToString::to_string).collect(), auth: crate::config::AuthHeader::Bearer, default_headers: Vec::new(), + timeout: crate::config::DEFAULT_ROUTE_TIMEOUT, } } diff --git a/crates/openshell-router/tests/backend_integration.rs b/crates/openshell-router/tests/backend_integration.rs index 4861bd6d0..571964aa8 100644 --- a/crates/openshell-router/tests/backend_integration.rs +++ b/crates/openshell-router/tests/backend_integration.rs @@ -15,6 +15,7 @@ fn mock_candidates(base_url: &str) -> Vec { protocols: vec!["openai_chat_completions".to_string()], auth: AuthHeader::Bearer, default_headers: Vec::new(), + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }] } @@ -117,6 +118,7 @@ async fn proxy_no_compatible_route_returns_error() { protocols: vec!["anthropic_messages".to_string()], auth: AuthHeader::Custom("x-api-key"), default_headers: Vec::new(), + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let err = router @@ -178,6 +180,7 @@ async fn proxy_mock_route_returns_canned_response() { protocols: vec!["openai_chat_completions".to_string()], auth: AuthHeader::Bearer, default_headers: Vec::new(), + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let body = serde_json::to_vec(&serde_json::json!({ @@ -312,6 +315,7 @@ async fn proxy_uses_x_api_key_for_anthropic_route() { protocols: vec!["anthropic_messages".to_string()], auth: AuthHeader::Custom("x-api-key"), default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let body = serde_json::to_vec(&serde_json::json!({ @@ -370,6 +374,7 @@ async fn proxy_anthropic_does_not_send_bearer_auth() { protocols: vec!["anthropic_messages".to_string()], auth: AuthHeader::Custom("x-api-key"), default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let response = 
router @@ -414,6 +419,7 @@ async fn proxy_forwards_client_anthropic_version_header() { protocols: vec!["anthropic_messages".to_string()], auth: AuthHeader::Custom("x-api-key"), default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let body = serde_json::to_vec(&serde_json::json!({ diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 297d7fc38..149632446 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -801,6 +801,11 @@ pub(crate) fn bundle_to_resolved_routes( .map(|r| { let (auth, default_headers) = openshell_core::inference::auth_for_provider_type(&r.provider_type); + let timeout = if r.timeout_secs == 0 { + openshell_router::config::DEFAULT_ROUTE_TIMEOUT + } else { + Duration::from_secs(r.timeout_secs) + }; openshell_router::config::ResolvedRoute { name: r.name.clone(), endpoint: r.base_url.clone(), @@ -809,6 +814,7 @@ pub(crate) fn bundle_to_resolved_routes( protocols: r.protocols.clone(), auth, default_headers, + timeout, } }) .collect() @@ -1517,6 +1523,7 @@ mod tests { "openai_responses".to_string(), ], provider_type: "openai".to_string(), + timeout_secs: 0, }, openshell_core::proto::ResolvedRoute { name: "local".to_string(), @@ -1525,6 +1532,7 @@ mod tests { model_id: "llama-3".to_string(), protocols: vec!["openai_chat_completions".to_string()], provider_type: String::new(), + timeout_secs: 120, }, ], revision: "abc123".to_string(), @@ -1545,11 +1553,21 @@ mod tests { routes[0].protocols, vec!["openai_chat_completions", "openai_responses"] ); + assert_eq!( + routes[0].timeout, + openshell_router::config::DEFAULT_ROUTE_TIMEOUT, + "timeout_secs=0 should map to default" + ); assert_eq!(routes[1].endpoint, "http://vllm:8000/v1"); assert_eq!( routes[1].auth, openshell_core::inference::AuthHeader::Bearer ); + assert_eq!( + routes[1].timeout, + Duration::from_secs(120), + "timeout_secs=120 
should map to 120s" + ); } #[test] @@ -1574,6 +1592,7 @@ mod tests { model_id: "model".to_string(), protocols: vec!["openai_chat_completions".to_string()], provider_type: "openai".to_string(), + timeout_secs: 0, }], revision: "rev".to_string(), generated_at_ms: 0, @@ -1594,6 +1613,7 @@ mod tests { protocols: vec!["openai_chat_completions".to_string()], auth: openshell_core::inference::AuthHeader::Bearer, default_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }, openshell_router::config::ResolvedRoute { name: "sandbox-system".to_string(), @@ -1603,6 +1623,7 @@ mod tests { protocols: vec!["anthropic_messages".to_string()], auth: openshell_core::inference::AuthHeader::Custom("x-api-key"), default_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }, ]; @@ -1891,6 +1912,7 @@ filesystem_policy: auth: openshell_core::inference::AuthHeader::Bearer, protocols: vec!["openai_chat_completions".to_string()], default_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let cache = Arc::new(RwLock::new(routes)); diff --git a/crates/openshell-sandbox/tests/system_inference.rs b/crates/openshell-sandbox/tests/system_inference.rs index 3f6a471e5..5d581fbe2 100644 --- a/crates/openshell-sandbox/tests/system_inference.rs +++ b/crates/openshell-sandbox/tests/system_inference.rs @@ -20,6 +20,7 @@ fn make_system_route() -> ResolvedRoute { protocols: vec!["openai_chat_completions".to_string()], auth: AuthHeader::Bearer, default_headers: Vec::new(), + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, } } @@ -32,6 +33,7 @@ fn make_user_route() -> ResolvedRoute { protocols: vec!["openai_chat_completions".to_string()], auth: AuthHeader::Bearer, default_headers: Vec::new(), + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, } } @@ -124,6 +126,7 @@ async fn system_inference_with_anthropic_protocol() { protocols: vec!["anthropic_messages".to_string()], auth: AuthHeader::Custom("x-api-key"), 
default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }; let ctx = InferenceContext::new(patterns, router, vec![], vec![system_route]); diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index bbabaf70b..0fb29bde5 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -86,6 +86,7 @@ impl Inference for InferenceService { route_name, &req.provider_name, &req.model_id, + req.timeout_secs, verify, ) .await?; @@ -103,6 +104,7 @@ impl Inference for InferenceService { route_name: route_name.to_string(), validation_performed: !route.validation.is_empty(), validated_endpoints: route.validation, + timeout_secs: config.timeout_secs, })) } @@ -140,6 +142,7 @@ impl Inference for InferenceService { model_id: config.model_id.clone(), version: route.version, route_name: route_name.to_string(), + timeout_secs: config.timeout_secs, })) } } @@ -149,6 +152,7 @@ async fn upsert_cluster_inference_route( route_name: &str, provider_name: &str, model_id: &str, + timeout_secs: u64, verify: bool, ) -> Result { if provider_name.trim().is_empty() { @@ -173,7 +177,7 @@ async fn upsert_cluster_inference_route( Vec::new() }; - let config = build_cluster_inference_config(&provider, model_id); + let config = build_cluster_inference_config(&provider, model_id, timeout_secs); let existing = store .get_message_by_name::(route_name) @@ -204,10 +208,15 @@ async fn upsert_cluster_inference_route( Ok(UpsertedInferenceRoute { route, validation }) } -fn build_cluster_inference_config(provider: &Provider, model_id: &str) -> ClusterInferenceConfig { +fn build_cluster_inference_config( + provider: &Provider, + model_id: &str, + timeout_secs: u64, +) -> ClusterInferenceConfig { ClusterInferenceConfig { provider_name: provider.name.clone(), model_id: model_id.to_string(), + timeout_secs, } } @@ -267,6 +276,7 @@ fn 
resolve_provider_route(provider: &Provider) -> Result Result Date: Mon, 30 Mar 2026 16:08:26 -0700 Subject: [PATCH 17/45] fix(sandbox): track PTY state per SSH channel to fix terminal resize (#687) Replace flat pty_master/input_sender/pty_request fields in SshHandler with a HashMap so each channel tracks its own PTY resources independently. This fixes window_change_request resizing the wrong PTY when multiple channels are open simultaneously. Also fixes ioctl UB in set_winsize (pass &winsize not winsize by value) and adds warn! logging for unknown channels across all handlers. Resolves #543 --- crates/openshell-sandbox/src/ssh.rs | 198 +++++++++++++++++++++++++--- 1 file changed, 177 insertions(+), 21 deletions(-) diff --git a/crates/openshell-sandbox/src/ssh.rs b/crates/openshell-sandbox/src/ssh.rs index 10eab8c45..e3add8874 100644 --- a/crates/openshell-sandbox/src/ssh.rs +++ b/crates/openshell-sandbox/src/ssh.rs @@ -263,6 +263,19 @@ fn hmac_sha256(key: &[u8], data: &[u8]) -> String { hex::encode(result) } +/// Per-channel state for tracking PTY resources and I/O senders. +/// +/// Each SSH channel gets its own PTY master (if a PTY was requested) and input +/// sender. This allows `window_change_request` to resize the correct PTY when +/// multiple channels are open simultaneously (e.g. parallel shells, shell + +/// sftp, etc.). 
+#[derive(Default)] +struct ChannelState { + input_sender: Option>>, + pty_master: Option, + pty_request: Option, +} + struct SshHandler { policy: SandboxPolicy, workdir: Option, @@ -270,9 +283,7 @@ struct SshHandler { proxy_url: Option, ca_file_paths: Option>, provider_env: HashMap, - input_sender: Option>>, - pty_master: Option, - pty_request: Option, + channels: HashMap, } impl SshHandler { @@ -291,9 +302,7 @@ impl SshHandler { proxy_url, ca_file_paths, provider_env, - input_sender: None, - pty_master: None, - pty_request: None, + channels: HashMap::new(), } } } @@ -315,12 +324,27 @@ impl russh::server::Handler for SshHandler { async fn channel_open_session( &mut self, - _channel: russh::Channel, + channel: russh::Channel, _session: &mut Session, ) -> Result { + self.channels.insert(channel.id(), ChannelState::default()); Ok(true) } + /// Clean up per-channel state when the channel is closed. + /// + /// This is the final cleanup and subsumes `channel_eof` — if `channel_close` + /// fires without a preceding `channel_eof`, all resources (pty_master File, + /// input_sender) are dropped here. 
+ async fn channel_close( + &mut self, + channel: ChannelId, + _session: &mut Session, + ) -> Result<(), Self::Error> { + self.channels.remove(&channel); + Ok(()) + } + async fn channel_open_direct_tcpip( &mut self, channel: russh::Channel, @@ -388,7 +412,11 @@ impl russh::server::Handler for SshHandler { _modes: &[(russh::Pty, u32)], session: &mut Session, ) -> Result<(), Self::Error> { - self.pty_request = Some(PtyRequest { + let state = self + .channels + .get_mut(&channel) + .ok_or_else(|| anyhow::anyhow!("pty_request on unknown channel {channel:?}"))?; + state.pty_request = Some(PtyRequest { term: term.to_string(), col_width, row_height, @@ -401,21 +429,27 @@ impl russh::server::Handler for SshHandler { async fn window_change_request( &mut self, - _channel: ChannelId, + channel: ChannelId, col_width: u32, row_height: u32, pixel_width: u32, pixel_height: u32, _session: &mut Session, ) -> Result<(), Self::Error> { - if let Some(master) = self.pty_master.as_ref() { + let Some(state) = self.channels.get(&channel) else { + warn!("window_change_request on unknown channel {channel:?}"); + return Ok(()); + }; + if let Some(master) = state.pty_master.as_ref() { let winsize = Winsize { ws_row: to_u16(row_height.max(1)), ws_col: to_u16(col_width.max(1)), ws_xpixel: to_u16(pixel_width), ws_ypixel: to_u16(pixel_height), }; - let _ = unsafe_pty::set_winsize(master.as_raw_fd(), winsize); + if let Err(e) = unsafe_pty::set_winsize(master.as_raw_fd(), winsize) { + warn!("failed to resize PTY for channel {channel:?}: {e}"); + } } Ok(()) } @@ -474,7 +508,10 @@ impl russh::server::Handler for SshHandler { self.ca_file_paths.clone(), &self.provider_env, )?; - self.input_sender = Some(input_sender); + let state = self.channels.get_mut(&channel).ok_or_else(|| { + anyhow::anyhow!("subsystem_request on unknown channel {channel:?}") + })?; + state.input_sender = Some(input_sender); } else { warn!(subsystem = name, "unsupported subsystem requested"); session.channel_failure(channel)?; @@ 
-499,11 +536,15 @@ impl russh::server::Handler for SshHandler { async fn data( &mut self, - _channel: ChannelId, + channel: ChannelId, data: &[u8], _session: &mut Session, ) -> Result<(), Self::Error> { - if let Some(sender) = self.input_sender.as_ref() { + let Some(state) = self.channels.get(&channel) else { + warn!("data on unknown channel {channel:?}"); + return Ok(()); + }; + if let Some(sender) = state.input_sender.as_ref() { let _ = sender.send(data.to_vec()); } Ok(()) @@ -511,14 +552,18 @@ impl russh::server::Handler for SshHandler { async fn channel_eof( &mut self, - _channel: ChannelId, + channel: ChannelId, _session: &mut Session, ) -> Result<(), Self::Error> { // Drop the input sender so the stdin writer thread sees a // disconnected channel and closes the child's stdin pipe. This // is essential for commands like `cat | tar xf -` which need // stdin EOF to know the input stream is complete. - self.input_sender.take(); + if let Some(state) = self.channels.get_mut(&channel) { + state.input_sender.take(); + } else { + warn!("channel_eof on unknown channel {channel:?}"); + } Ok(()) } } @@ -530,7 +575,11 @@ impl SshHandler { handle: Handle, command: Option, ) -> anyhow::Result<()> { - if let Some(pty) = self.pty_request.take() { + let state = self + .channels + .get_mut(&channel) + .ok_or_else(|| anyhow::anyhow!("start_shell on unknown channel {channel:?}"))?; + if let Some(pty) = state.pty_request.take() { // PTY was requested — allocate a real PTY (interactive shell or // exec that explicitly asked for a terminal). let (pty_master, input_sender) = spawn_pty_shell( @@ -545,8 +594,8 @@ impl SshHandler { self.ca_file_paths.clone(), &self.provider_env, )?; - self.pty_master = Some(pty_master); - self.input_sender = Some(input_sender); + state.pty_master = Some(pty_master); + state.input_sender = Some(input_sender); } else { // No PTY requested — use plain pipes so stdout/stderr are // separate and output has clean LF line endings. 
This is the @@ -562,7 +611,7 @@ impl SshHandler { self.ca_file_paths.clone(), &self.provider_env, )?; - self.input_sender = Some(input_sender); + state.input_sender = Some(input_sender); } Ok(()) } @@ -999,7 +1048,7 @@ mod unsafe_pty { #[allow(unsafe_code)] pub fn set_winsize(fd: RawFd, winsize: Winsize) -> std::io::Result<()> { - let rc = unsafe { libc::ioctl(fd, libc::TIOCSWINSZ, winsize) }; + let rc = unsafe { libc::ioctl(fd, libc::TIOCSWINSZ, &winsize) }; if rc != 0 { return Err(std::io::Error::last_os_error()); } @@ -1404,4 +1453,111 @@ mod tests { assert!(!is_loopback_host("not-an-ip")); assert!(!is_loopback_host("[]")); } + + // ----------------------------------------------------------------------- + // Per-channel PTY state tests (#543) + // ----------------------------------------------------------------------- + + #[test] + fn set_winsize_applies_to_correct_pty() { + // Verify that set_winsize applies to a specific PTY master FD, + // which is the mechanism that per-channel tracking relies on. + // With the old single-pty_master design, a window_change_request + // for channel N would resize whatever PTY was stored last — + // potentially belonging to a different channel. + let pty_a = openpty(None, None).expect("openpty a"); + let pty_b = openpty(None, None).expect("openpty b"); + let master_a = std::fs::File::from(pty_a.master); + let master_b = std::fs::File::from(pty_b.master); + let fd_a = master_a.as_raw_fd(); + let fd_b = master_b.as_raw_fd(); + assert_ne!(fd_a, fd_b, "two PTYs must have distinct FDs"); + + // Close the slave ends to avoid leaking FDs in the test. + drop(std::fs::File::from(pty_a.slave)); + drop(std::fs::File::from(pty_b.slave)); + + // Resize only PTY B. + let winsize_b = Winsize { + ws_row: 50, + ws_col: 120, + ws_xpixel: 0, + ws_ypixel: 0, + }; + unsafe_pty::set_winsize(fd_b, winsize_b).expect("set_winsize on PTY B"); + + // Resize PTY A to a different size. 
+ let winsize_a = Winsize { + ws_row: 24, + ws_col: 80, + ws_xpixel: 0, + ws_ypixel: 0, + }; + unsafe_pty::set_winsize(fd_a, winsize_a).expect("set_winsize on PTY A"); + + // Read back sizes via ioctl to verify independence. + let mut actual_a: libc::winsize = unsafe { std::mem::zeroed() }; + let mut actual_b: libc::winsize = unsafe { std::mem::zeroed() }; + #[allow(unsafe_code)] + unsafe { + libc::ioctl(fd_a, libc::TIOCGWINSZ, &mut actual_a); + libc::ioctl(fd_b, libc::TIOCGWINSZ, &mut actual_b); + } + + assert_eq!(actual_a.ws_row, 24, "PTY A should be 24 rows"); + assert_eq!(actual_a.ws_col, 80, "PTY A should be 80 cols"); + assert_eq!(actual_b.ws_row, 50, "PTY B should be 50 rows"); + assert_eq!(actual_b.ws_col, 120, "PTY B should be 120 cols"); + } + + #[test] + fn channel_state_independent_input_senders() { + // Verify that each channel gets its own input sender so that + // data() and channel_eof() affect only the targeted channel. + let (tx_a, rx_a) = mpsc::channel::>(); + let (tx_b, rx_b) = mpsc::channel::>(); + + let mut state_a = ChannelState { + input_sender: Some(tx_a), + ..Default::default() + }; + let state_b = ChannelState { + input_sender: Some(tx_b), + ..Default::default() + }; + + // Send data to channel A only. + state_a + .input_sender + .as_ref() + .unwrap() + .send(b"hello-a".to_vec()) + .unwrap(); + // Send data to channel B only. + state_b + .input_sender + .as_ref() + .unwrap() + .send(b"hello-b".to_vec()) + .unwrap(); + + assert_eq!(rx_a.recv().unwrap(), b"hello-a"); + assert_eq!(rx_b.recv().unwrap(), b"hello-b"); + + // EOF on channel A (drop sender) should not affect channel B. + state_a.input_sender.take(); + assert!( + rx_a.recv().is_err(), + "channel A sender dropped, recv should fail" + ); + + // Channel B should still be functional. 
+ state_b + .input_sender + .as_ref() + .unwrap() + .send(b"still-alive".to_vec()) + .unwrap(); + assert_eq!(rx_b.recv().unwrap(), b"still-alive"); + } } From 047de66b22e6ef228479c4348e48a1296049849b Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 31 Mar 2026 17:23:52 +0200 Subject: [PATCH 18/45] feat(bootstrap,cli): switch GPU injection to CDI where supported (#495) * feat(bootstrap): switch GPU injection to CDI where supported Use an explicit CDI device request (driver="cdi", device_ids=["nvidia.com/gpu=all"]) when the Docker daemon reports CDI spec directories via GET /info (SystemInfo.CDISpecDirs). This makes device injection declarative and decouples spec generation from consumption. When the daemon reports no CDI spec directories, fall back to the legacy NVIDIA device request (driver="nvidia", count=-1) which relies on the NVIDIA Container Runtime hook. Failure modes for both paths are equivalent: a missing or stale NVIDIA Container Toolkit installation will cause container start to fail. CDI spec generation is out of scope for this change; specs are expected to be pre-generated out-of-band, for example by the NVIDIA Container Toolkit. 
--------- Signed-off-by: Evan Lezar Co-authored-by: Piotr Mlocek --- README.md | 2 +- architecture/gateway-single-node.md | 14 ++- crates/openshell-bootstrap/src/docker.rs | 120 +++++++++++++++++++---- crates/openshell-bootstrap/src/lib.rs | 39 ++++++-- crates/openshell-cli/src/bootstrap.rs | 6 +- crates/openshell-cli/src/main.rs | 15 ++- crates/openshell-cli/src/run.rs | 2 +- docs/sandboxes/manage-gateways.md | 2 +- e2e/rust/tests/cli_smoke.rs | 17 ++-- 9 files changed, 175 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index ccf8bbdef..5549fa7b4 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ OpenShell can pass host GPUs into sandboxes for local inference, fine-tuning, or openshell sandbox create --gpu --from [gpu-enabled-sandbox] -- claude ``` -The CLI auto-bootstraps a GPU-enabled gateway on first use. GPU intent is also inferred automatically for community images with `gpu` in the name. +The CLI auto-bootstraps a GPU-enabled gateway on first use, auto-selecting CDI when available and otherwise falling back to Docker's NVIDIA GPU request path (`--gpus all`). GPU intent is also inferred automatically for community images with `gpu` in the name. **Requirements:** NVIDIA drivers and the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) must be installed on the host. The sandbox image itself must include the appropriate GPU drivers and libraries for your workload — the default `base` image does not. See the [BYOC example](https://github.com/NVIDIA/OpenShell/tree/main/examples/bring-your-own-container) for building a custom sandbox image with GPU support. 
diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 57aebd3a5..c417775a5 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -296,8 +296,10 @@ When environment variables are set, the entrypoint modifies the HelmChart manife GPU support is part of the single-node gateway bootstrap path rather than a separate architecture. -- `openshell gateway start --gpu` threads a boolean deploy option through `crates/openshell-cli`, `crates/openshell-bootstrap`, and `crates/openshell-bootstrap/src/docker.rs`. -- When enabled, the cluster container is created with Docker `DeviceRequests`, which is the API equivalent of `docker run --gpus all`. +- `openshell gateway start --gpu` threads GPU device options through `crates/openshell-cli`, `crates/openshell-bootstrap`, and `crates/openshell-bootstrap/src/docker.rs`. +- When enabled, the cluster container is created with Docker `DeviceRequests`. The injection mechanism is selected based on whether CDI is enabled on the daemon (`SystemInfo.CDISpecDirs` via `GET /info`): + - **CDI enabled** (daemon reports non-empty `CDISpecDirs`): CDI device injection — `driver="cdi"` with `nvidia.com/gpu=all`. Specs are expected to be pre-generated on the host (e.g. automatically by the `nvidia-cdi-refresh.service` or manually via `nvidia-ctk generate`). + - **CDI not enabled**: `--gpus all` device request — `driver="nvidia"`, `count=-1`, which relies on the NVIDIA Container Runtime hook. - `deploy/docker/Dockerfile.images` installs NVIDIA Container Toolkit packages in a dedicated Ubuntu stage and copies the runtime binaries, config, and `libnvidia-container` shared libraries into the final Ubuntu-based cluster image. - `deploy/docker/cluster-entrypoint.sh` checks `GPU_ENABLED=true` and copies GPU-only manifests from `/opt/openshell/gpu-manifests/` into k3s's manifests directory. 
- `deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml` installs the NVIDIA device plugin chart, currently pinned to `0.18.2`. NFD and GFD are disabled; the device plugin's default `nodeAffinity` (which requires `feature.node.kubernetes.io/pci-10de.present=true` or `nvidia.com/gpu.present=true` from NFD/GFD) is overridden to empty so the DaemonSet schedules on the single-node cluster without requiring those labels. @@ -308,12 +310,16 @@ The runtime chain is: ```text Host GPU drivers & NVIDIA Container Toolkit - └─ Docker: --gpus all (DeviceRequests in bollard API) + └─ Docker: DeviceRequests (CDI when enabled, --gpus all otherwise) └─ k3s/containerd: nvidia-container-runtime on PATH -> auto-detected └─ k8s: nvidia-device-plugin DaemonSet advertises nvidia.com/gpu └─ Pods: request nvidia.com/gpu in resource limits ``` +### `--gpu` flag + +The `--gpu` flag on `gateway start` enables GPU passthrough. OpenShell auto-selects CDI when enabled on the daemon and falls back to Docker's NVIDIA GPU request path (`--gpus all`) otherwise. + The expected smoke test is a plain pod requesting `nvidia.com/gpu: 1` with `runtimeClassName: nvidia` and running `nvidia-smi`. ## Remote Image Transfer @@ -381,7 +387,7 @@ When `openshell sandbox create` cannot connect to a gateway (connection refused, 1. `should_attempt_bootstrap()` in `crates/openshell-cli/src/bootstrap.rs` checks the error type. It returns `true` for connectivity errors and missing default TLS materials, but `false` for TLS handshake/auth errors. 2. If running in a terminal, the user is prompted to confirm. 3. `run_bootstrap()` deploys a gateway named `"openshell"`, sets it as active, and returns fresh `TlsOptions` pointing to the newly-written mTLS certs. -4. When `sandbox create` requests GPU explicitly (`--gpu`) or infers it from an image whose final name component contains `gpu` (such as `nvidia-gpu`), the bootstrap path enables gateway GPU support before retrying sandbox creation. +4. 
When `sandbox create` requests GPU explicitly (`--gpu`) or infers it from an image whose final name component contains `gpu` (such as `nvidia-gpu`), the bootstrap path enables gateway GPU support before retrying sandbox creation, using the same CDI-or-fallback selection as `gateway start --gpu`. ## Container Environment Variables diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index 9c365bfe8..cc63aacce 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -22,6 +22,29 @@ use std::collections::HashMap; const REGISTRY_NAMESPACE_DEFAULT: &str = "openshell"; +/// Resolve the raw GPU device-ID list, replacing the `"auto"` sentinel with a +/// concrete device ID based on whether CDI is enabled on the daemon. +/// +/// | Input | Output | +/// |--------------|--------------------------------------------------------------| +/// | `[]` | `[]` — no GPU | +/// | `["legacy"]` | `["legacy"]` — pass through to the non-CDI fallback path | +/// | `["auto"]` | `["nvidia.com/gpu=all"]` if CDI enabled, else `["legacy"]` | +/// | `[cdi-ids…]` | unchanged | +pub(crate) fn resolve_gpu_device_ids(gpu: &[String], cdi_enabled: bool) -> Vec { + match gpu { + [] => vec![], + [v] if v == "auto" => { + if cdi_enabled { + vec!["nvidia.com/gpu=all".to_string()] + } else { + vec!["legacy".to_string()] + } + } + other => other.to_vec(), + } +} + const REGISTRY_MODE_EXTERNAL: &str = "external"; fn env_non_empty(key: &str) -> Option { @@ -454,7 +477,7 @@ pub async fn ensure_container( disable_gateway_auth: bool, registry_username: Option<&str>, registry_token: Option<&str>, - gpu: bool, + device_ids: &[String], ) -> Result<()> { let container_name = container_name(name); @@ -542,21 +565,35 @@ pub async fn ensure_container( ..Default::default() }; - // When GPU support is requested, add NVIDIA device requests. - // This is the programmatic equivalent of `docker run --gpus all`. 
- // The NVIDIA Container Toolkit runtime hook injects /dev/nvidia* devices - // and GPU driver libraries from the host into the container. - if gpu { - host_config.device_requests = Some(vec![DeviceRequest { - driver: Some("nvidia".to_string()), - count: Some(-1), // all GPUs - capabilities: Some(vec![vec![ - "gpu".to_string(), - "utility".to_string(), - "compute".to_string(), - ]]), - ..Default::default() - }]); + // Inject GPU devices into the container based on the resolved device ID list. + // + // The list is pre-resolved by `resolve_gpu_device_ids` before reaching here: + // [] — no GPU passthrough + // ["legacy"] — internal non-CDI fallback path: `driver="nvidia"`, + // `count=-1`; relies on the NVIDIA Container Runtime hook + // [cdi-ids…] — CDI DeviceRequest (driver="cdi") with the given device IDs; + // Docker resolves them against the host CDI spec at /etc/cdi/ + match device_ids { + [] => {} + [id] if id == "legacy" => { + host_config.device_requests = Some(vec![DeviceRequest { + driver: Some("nvidia".to_string()), + count: Some(-1), // all GPUs + capabilities: Some(vec![vec![ + "gpu".to_string(), + "utility".to_string(), + "compute".to_string(), + ]]), + ..Default::default() + }]); + } + ids => { + host_config.device_requests = Some(vec![DeviceRequest { + driver: Some("cdi".to_string()), + device_ids: Some(ids.to_vec()), + ..Default::default() + }]); + } } let mut cmd = vec![ @@ -671,7 +708,7 @@ pub async fn ensure_container( // GPU support: tell the entrypoint to deploy the NVIDIA device plugin // HelmChart CR so k8s workloads can request nvidia.com/gpu resources. 
- if gpu { + if !device_ids.is_empty() { env_vars.push("GPU_ENABLED=true".to_string()); } @@ -1195,4 +1232,53 @@ mod tests { "should return a reasonable number of sockets" ); } + + // --- resolve_gpu_device_ids --- + + #[test] + fn resolve_gpu_empty_returns_empty() { + assert_eq!(resolve_gpu_device_ids(&[], true), Vec::::new()); + assert_eq!(resolve_gpu_device_ids(&[], false), Vec::::new()); + } + + #[test] + fn resolve_gpu_auto_cdi_enabled() { + assert_eq!( + resolve_gpu_device_ids(&["auto".to_string()], true), + vec!["nvidia.com/gpu=all"], + ); + } + + #[test] + fn resolve_gpu_auto_cdi_disabled() { + assert_eq!( + resolve_gpu_device_ids(&["auto".to_string()], false), + vec!["legacy"], + ); + } + + #[test] + fn resolve_gpu_legacy_passthrough() { + assert_eq!( + resolve_gpu_device_ids(&["legacy".to_string()], true), + vec!["legacy"], + ); + assert_eq!( + resolve_gpu_device_ids(&["legacy".to_string()], false), + vec!["legacy"], + ); + } + + #[test] + fn resolve_gpu_cdi_ids_passthrough() { + let ids = vec!["nvidia.com/gpu=all".to_string()]; + assert_eq!(resolve_gpu_device_ids(&ids, true), ids); + assert_eq!(resolve_gpu_device_ids(&ids, false), ids); + + let multi = vec![ + "nvidia.com/gpu=0".to_string(), + "nvidia.com/gpu=1".to_string(), + ]; + assert_eq!(resolve_gpu_device_ids(&multi, true), multi); + } } diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 938986757..7dcabe052 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -31,7 +31,8 @@ use crate::constants::{ }; use crate::docker::{ check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container, - ensure_image, ensure_network, ensure_volume, start_container, stop_container, + ensure_image, ensure_network, ensure_volume, resolve_gpu_device_ids, start_container, + stop_container, }; use crate::metadata::{ create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host, @@ -111,10 
+112,13 @@ pub struct DeployOptions { /// bootstrap pull and inside the k3s cluster at runtime. Only needed /// for private registries. pub registry_token: Option, - /// Enable NVIDIA GPU passthrough. When true, the Docker container is - /// created with GPU device requests (`--gpus all`) and the NVIDIA - /// k8s-device-plugin is deployed inside the k3s cluster. - pub gpu: bool, + /// GPU device IDs to inject into the gateway container. + /// + /// - `[]` — no GPU passthrough (default) + /// - `["legacy"]` — internal non-CDI fallback path (`driver="nvidia"`, `count=-1`) + /// - `["auto"]` — resolved at deploy time: CDI if enabled on the daemon, else the non-CDI fallback + /// - `[cdi-ids…]` — CDI DeviceRequest with the given device IDs + pub gpu: Vec, /// When true, destroy any existing gateway resources before deploying. /// When false, an existing gateway is left as-is and deployment is /// skipped (the caller is responsible for prompting the user first). @@ -133,7 +137,7 @@ impl DeployOptions { disable_gateway_auth: false, registry_username: None, registry_token: None, - gpu: false, + gpu: vec![], recreate: false, } } @@ -187,9 +191,13 @@ impl DeployOptions { self } - /// Enable NVIDIA GPU passthrough for the cluster container. + /// Set GPU device IDs for the cluster container. + /// + /// Pass `vec!["auto"]` to auto-select between CDI and the non-CDI fallback + /// based on daemon capabilities at deploy time. The `legacy` sentinel is an + /// internal implementation detail for the fallback path. #[must_use] - pub fn with_gpu(mut self, gpu: bool) -> Self { + pub fn with_gpu(mut self, gpu: Vec) -> Self { self.gpu = gpu; self } @@ -288,6 +296,18 @@ where (preflight.docker, None) }; + // CDI is considered enabled when the daemon reports at least one CDI spec + // directory via `GET /info` (`SystemInfo.CDISpecDirs`). An empty list or + // missing field means CDI is not configured and we fall back to the legacy + // NVIDIA `DeviceRequest` (driver="nvidia"). 
Detection is best-effort — + // failure to query daemon info is non-fatal. + let cdi_supported = target_docker + .info() + .await + .ok() + .and_then(|info| info.cdi_spec_dirs) + .is_some_and(|dirs| !dirs.is_empty()); + // If an existing gateway is found, either tear it down (when recreate is // requested) or bail out so the caller can prompt the user / reuse it. if let Some(existing) = check_existing_gateway(&target_docker, &name).await? { @@ -405,6 +425,7 @@ where // leaving an orphaned volume in a corrupted state that blocks retries. // See: https://github.com/NVIDIA/OpenShell/issues/463 let deploy_result: Result = async { + let device_ids = resolve_gpu_device_ids(&gpu, cdi_supported); ensure_container( &target_docker, &name, @@ -416,7 +437,7 @@ where disable_gateway_auth, registry_username.as_deref(), registry_token.as_deref(), - gpu, + &device_ids, ) .await?; start_container(&target_docker, &name).await?; diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index e976061fa..ea6410b91 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -178,7 +178,11 @@ pub async fn run_bootstrap( { options = options.with_gateway_host(host); } - options = options.with_gpu(gpu); + options = options.with_gpu(if gpu { + vec!["auto".to_string()] + } else { + vec![] + }); let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?; let server = handle.gateway_endpoint().to_string(); diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index df37410b6..4f4d49695 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -807,6 +807,10 @@ enum GatewayCommands { /// NVIDIA k8s-device-plugin so Kubernetes workloads can request /// `nvidia.com/gpu` resources. Requires NVIDIA drivers and the /// NVIDIA Container Toolkit on the host. 
+ /// + /// When enabled, OpenShell auto-selects CDI when the Docker daemon has + /// CDI enabled and falls back to Docker's NVIDIA GPU request path + /// (`--gpus all`) otherwise. #[arg(long)] gpu: bool, }, @@ -1112,8 +1116,10 @@ enum SandboxCommands { /// Request GPU resources for the sandbox. /// /// When no gateway is running, auto-bootstrap starts a GPU-enabled - /// gateway. GPU intent is also inferred automatically for known - /// GPU-designated image names such as `nvidia-gpu`. + /// gateway using the same automatic injection selection as + /// `openshell gateway start --gpu`. GPU intent is also inferred + /// automatically for known GPU-designated image names such as + /// `nvidia-gpu`. #[arg(long)] gpu: bool, @@ -1570,6 +1576,11 @@ async fn main() -> Result<()> { registry_token, gpu, } => { + let gpu = if gpu { + vec!["auto".to_string()] + } else { + vec![] + }; run::gateway_admin_deploy( &name, remote.as_deref(), diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index bab819137..67eafc886 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1355,7 +1355,7 @@ pub async fn gateway_admin_deploy( disable_gateway_auth: bool, registry_username: Option<&str>, registry_token: Option<&str>, - gpu: bool, + gpu: Vec<String>, ) -> Result<()> { let location = if remote.is_some() { "remote" } else { "local" }; diff --git a/docs/sandboxes/manage-gateways.md b/docs/sandboxes/manage-gateways.md index 2f3dba7a9..d4d9ccf55 100644 --- a/docs/sandboxes/manage-gateways.md +++ b/docs/sandboxes/manage-gateways.md @@ -168,7 +168,7 @@ $ openshell gateway info --name my-remote-cluster | Flag | Purpose | |---|---| -| `--gpu` | Enable NVIDIA GPU passthrough. Requires NVIDIA drivers and the Container Toolkit on the host. | +| `--gpu` | Enable NVIDIA GPU passthrough. Requires NVIDIA drivers and the Container Toolkit on the host. 
OpenShell auto-selects CDI when enabled on the daemon and falls back to Docker's NVIDIA GPU request path (`--gpus all`) otherwise. | | `--plaintext` | Listen on HTTP instead of mTLS. Use behind a TLS-terminating reverse proxy. | | `--disable-gateway-auth` | Skip mTLS client certificate checks. Use when a reverse proxy cannot forward client certs. | | `--registry-username` | Username for registry authentication. Defaults to `__token__` when `--registry-token` is set. Only needed for private registries. Also configurable with `OPENSHELL_REGISTRY_USERNAME`. | diff --git a/e2e/rust/tests/cli_smoke.rs b/e2e/rust/tests/cli_smoke.rs index 35b2801c9..0abc24b43 100644 --- a/e2e/rust/tests/cli_smoke.rs +++ b/e2e/rust/tests/cli_smoke.rs @@ -122,17 +122,22 @@ async fn sandbox_connect_help_shows_editor_flag() { ); } -/// `openshell gateway start --help` must show `--recreate`. +/// `openshell gateway start --help` must show key flags. #[tokio::test] -async fn gateway_start_help_shows_recreate() { +async fn gateway_start_help_shows_key_flags() { let (output, code) = run_isolated(&["gateway", "start", "--help"]).await; assert_eq!(code, 0, "openshell gateway start --help should exit 0"); let clean = strip_ansi(&output); - assert!( - clean.contains("--recreate"), - "expected '--recreate' in gateway start --help:\n{clean}" - ); + for flag in [ + "--gpu", + "--recreate", + ] { + assert!( + clean.contains(flag), + "expected '{flag}' in gateway start --help:\n{clean}" + ); + } } // ------------------------------------------------------------------- From 122bc74948a89f02dd6c042bacd96ba18c0e60da Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 31 Mar 2026 17:35:39 +0200 Subject: [PATCH 19/45] feat(sandbox): switch device plugin to CDI injection mode (#503) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(sandbox): switch device plugin to CDI injection mode Configure the NVIDIA device plugin to use deviceListStrategy=cdi-cri so that GPU 
devices are injected via direct CDI device requests in the CRI. Sandbox pods now only require the nvidia.com/gpu resource request — runtimeClassName is no longer set on GPU pods. Signed-off-by: Evan Lezar --- .../skills/debug-openshell-cluster/SKILL.md | 44 +++++++++- architecture/gateway-single-node.md | 8 +- crates/openshell-server/src/sandbox/mod.rs | 87 +++++++++++++------ deploy/docker/Dockerfile.images | 16 ++-- .../nvidia-device-plugin-helmchart.yaml | 9 ++ 5 files changed, 122 insertions(+), 42 deletions(-) diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 5af8895cf..4ef851a7e 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -257,7 +257,43 @@ Look for: - `OOMKilled` — memory limits too low - `FailedMount` — volume issues -### Step 8: Check DNS Resolution +### Step 8: Check GPU Device Plugin and CDI (GPU gateways only) + +Skip this step for non-GPU gateways. + +The NVIDIA device plugin DaemonSet must be running and healthy before GPU sandboxes can be created. It uses CDI injection (`deviceListStrategy: cdi-cri`) to inject GPU devices into sandbox pods — no `runtimeClassName` is set on sandbox pods. + +```bash +# DaemonSet status — numberReady must be >= 1 +openshell doctor exec -- kubectl get daemonset -n nvidia-device-plugin + +# Device plugin pod logs — look for "CDI" lines confirming CDI mode is active +openshell doctor exec -- kubectl logs -n nvidia-device-plugin -l app.kubernetes.io/name=nvidia-device-plugin --tail=50 + +# List CDI devices registered by the device plugin (requires nvidia-ctk in the cluster image). +# Device plugin CDI entries use the vendor string "k8s.device-plugin.nvidia.com" so entries +# will be prefixed "k8s.device-plugin.nvidia.com/gpu=". If the list is empty, CDI spec +# generation has not completed yet. 
+openshell doctor exec -- nvidia-ctk cdi list + +# Verify CDI spec files were generated on the node +openshell doctor exec -- ls /var/run/cdi/ + +# Helm install job logs for the device plugin chart +openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-nvidia-device-plugin --tail=100 + +# Confirm a GPU sandbox pod has no runtimeClassName (CDI injection, not runtime class) +openshell doctor exec -- kubectl get pod -n openshell -o jsonpath='{range .items[*]}{.metadata.name}{" runtimeClassName="}{.spec.runtimeClassName}{"\n"}{end}' +``` + +Common issues: + +- **DaemonSet 0/N ready**: The device plugin chart may still be deploying (k3s Helm controller can take 1–2 min) or the pod is crashing. Check pod logs. +- **`nvidia-ctk cdi list` returns no `k8s.device-plugin.nvidia.com/gpu=` entries**: CDI spec generation has not completed. The device plugin may still be starting or the `cdi-cri` strategy isn't active. Verify `deviceListStrategy: cdi-cri` is in the rendered Helm values. +- **No CDI spec files at `/var/run/cdi/`**: Same as above — device plugin hasn't written CDI specs yet. +- **`HEALTHCHECK_GPU_DEVICE_PLUGIN_NOT_READY` in health check logs**: Device plugin has no ready pods. Check DaemonSet events and pod logs. + +### Step 9: Check DNS Resolution DNS misconfiguration is a common root cause, especially on remote/Linux hosts: @@ -317,6 +353,7 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w | gRPC `UNIMPLEMENTED` for newer RPCs in push mode | Helm values still point at older pulled images instead of the pushed refs | Verify rendered `openshell-helmchart.yaml` uses the expected push refs (`server`, `sandbox`, `pki-job`) and not `:latest` | | Sandbox pods crash with `/opt/openshell/bin/openshell-sandbox: no such file or directory` | Supervisor binary missing from cluster image | The cluster image was built/published without the `supervisor-builder` target in `deploy/docker/Dockerfile.images`. 
Rebuild with `mise run docker:build:cluster` and recreate gateway. Bootstrap auto-detects via `HEALTHCHECK_MISSING_SUPERVISOR` marker | | `HEALTHCHECK_MISSING_SUPERVISOR` in health check logs | `/opt/openshell/bin/openshell-sandbox` not found in gateway container | Rebuild cluster image: `mise run docker:build:cluster`, then `openshell gateway destroy && openshell gateway start` | +| `nvidia-ctk cdi list` returns no `k8s.device-plugin.nvidia.com/gpu=` entries | CDI specs not yet generated by device plugin | Device plugin may still be starting; wait and retry, or check pod logs (Step 8) | ## Full Diagnostic Dump @@ -370,4 +407,9 @@ openshell doctor exec -- ls -la /opt/openshell/bin/openshell-sandbox echo "=== DNS Configuration ===" openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf + +# GPU gateways only +echo "=== GPU Device Plugin ===" +openshell doctor exec -- kubectl get daemonset -n nvidia-device-plugin +openshell doctor exec -- nvidia-ctk cdi list ``` diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index c417775a5..0921d3aaa 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -302,7 +302,7 @@ GPU support is part of the single-node gateway bootstrap path rather than a sepa - **CDI not enabled**: `--gpus all` device request — `driver="nvidia"`, `count=-1`, which relies on the NVIDIA Container Runtime hook. - `deploy/docker/Dockerfile.images` installs NVIDIA Container Toolkit packages in a dedicated Ubuntu stage and copies the runtime binaries, config, and `libnvidia-container` shared libraries into the final Ubuntu-based cluster image. - `deploy/docker/cluster-entrypoint.sh` checks `GPU_ENABLED=true` and copies GPU-only manifests from `/opt/openshell/gpu-manifests/` into k3s's manifests directory. -- `deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml` installs the NVIDIA device plugin chart, currently pinned to `0.18.2`. 
NFD and GFD are disabled; the device plugin's default `nodeAffinity` (which requires `feature.node.kubernetes.io/pci-10de.present=true` or `nvidia.com/gpu.present=true` from NFD/GFD) is overridden to empty so the DaemonSet schedules on the single-node cluster without requiring those labels. +- `deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml` installs the NVIDIA device plugin chart, currently pinned to `0.18.2`. NFD and GFD are disabled; the device plugin's default `nodeAffinity` (which requires `feature.node.kubernetes.io/pci-10de.present=true` or `nvidia.com/gpu.present=true` from NFD/GFD) is overridden to empty so the DaemonSet schedules on the single-node cluster without requiring those labels. The chart is configured with `deviceListStrategy: cdi-cri` so the device plugin injects devices via direct CDI device requests in the CRI. - k3s auto-detects `nvidia-container-runtime` on `PATH`, registers the `nvidia` containerd runtime, and creates the `nvidia` `RuntimeClass` automatically. - The OpenShell Helm chart grants the gateway service account cluster-scoped read access to `node.k8s.io/runtimeclasses` and core `nodes` so GPU sandbox admission can verify both the `nvidia` `RuntimeClass` and allocatable GPU capacity before creating a sandbox. @@ -313,14 +313,16 @@ Host GPU drivers & NVIDIA Container Toolkit └─ Docker: DeviceRequests (CDI when enabled, --gpus all otherwise) └─ k3s/containerd: nvidia-container-runtime on PATH -> auto-detected └─ k8s: nvidia-device-plugin DaemonSet advertises nvidia.com/gpu - └─ Pods: request nvidia.com/gpu in resource limits + └─ Pods: request nvidia.com/gpu in resource limits (CDI injection — no runtimeClassName needed) ``` ### `--gpu` flag The `--gpu` flag on `gateway start` enables GPU passthrough. OpenShell auto-selects CDI when enabled on the daemon and falls back to Docker's NVIDIA GPU request path (`--gpus all`) otherwise. 
-The expected smoke test is a plain pod requesting `nvidia.com/gpu: 1` with `runtimeClassName: nvidia` and running `nvidia-smi`. +Device injection uses CDI (`deviceListStrategy: cdi-cri`): the device plugin injects devices via direct CDI device requests in the CRI. Sandbox pods only need `nvidia.com/gpu: 1` in their resource limits, and GPU pods do not set `runtimeClassName`. + +The expected smoke test is a plain pod requesting `nvidia.com/gpu: 1` without `runtimeClassName` and running `nvidia-smi`. ## Remote Image Transfer diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs index e10b33d0c..3dca66493 100644 --- a/crates/openshell-server/src/sandbox/mod.rs +++ b/crates/openshell-server/src/sandbox/mod.rs @@ -31,7 +31,6 @@ pub const SANDBOX_KIND: &str = "Sandbox"; const SANDBOX_ID_LABEL: &str = "openshell.ai/sandbox-id"; const SANDBOX_MANAGED_LABEL: &str = "openshell.ai/managed-by"; const SANDBOX_MANAGED_VALUE: &str = "openshell"; -const GPU_RUNTIME_CLASS_NAME: &str = "nvidia"; const GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; const GPU_RESOURCE_QUANTITY: &str = "1"; @@ -127,25 +126,6 @@ impl SandboxClient { } pub async fn validate_gpu_support(&self) -> Result<(), tonic::Status> { - let runtime_classes: Api = Api::all_with( - self.client.clone(), - &ApiResource::from_gvk(&GroupVersionKind::gvk("node.k8s.io", "v1", "RuntimeClass")), - ); - - let runtime_class_exists = runtime_classes - .get_opt(GPU_RUNTIME_CLASS_NAME) - .await - .map_err(|err| { - tonic::Status::internal(format!("check GPU runtime class failed: {err}")) - })? - .is_some(); - - if !runtime_class_exists { - return Err(tonic::Status::failed_precondition( - "GPU sandbox requested, but the active gateway is not GPU-enabled. 
To start a gateway with GPU support run: `openshell gateway start --gpu`", - )); - } - let nodes: Api = Api::all(self.client.clone()); let node_list = nodes.list(&ListParams::default()).await.map_err(|err| { tonic::Status::internal(format!("check GPU node capacity failed: {err}")) @@ -869,12 +849,7 @@ fn sandbox_template_to_k8s( } let mut spec = serde_json::Map::new(); - if gpu { - spec.insert( - "runtimeClassName".to_string(), - serde_json::json!(GPU_RUNTIME_CLASS_NAME), - ); - } else if !template.runtime_class_name.is_empty() { + if !template.runtime_class_name.is_empty() { spec.insert( "runtimeClassName".to_string(), serde_json::json!(template.runtime_class_name), @@ -1660,7 +1635,7 @@ mod tests { assert_eq!( pod_template["spec"]["runtimeClassName"], - serde_json::json!(GPU_RUNTIME_CLASS_NAME) + serde_json::Value::Null ); assert_eq!( pod_template["spec"]["containers"][0]["resources"]["limits"][GPU_RESOURCE_NAME], @@ -1668,6 +1643,64 @@ mod tests { ); } + #[test] + fn gpu_sandbox_uses_template_runtime_class_name_when_set() { + let template = SandboxTemplate { + runtime_class_name: "kata-containers".to_string(), + ..SandboxTemplate::default() + }; + + let pod_template = sandbox_template_to_k8s( + &template, + true, + "openshell/sandbox:latest", + "", + "sandbox-id", + "sandbox-name", + "https://gateway.example.com", + "0.0.0.0:2222", + "secret", + 300, + &std::collections::HashMap::new(), + "", + "", + ); + + assert_eq!( + pod_template["spec"]["runtimeClassName"], + serde_json::json!("kata-containers") + ); + } + + #[test] + fn non_gpu_sandbox_uses_template_runtime_class_name_when_set() { + let template = SandboxTemplate { + runtime_class_name: "kata-containers".to_string(), + ..SandboxTemplate::default() + }; + + let pod_template = sandbox_template_to_k8s( + &template, + false, + "openshell/sandbox:latest", + "", + "sandbox-id", + "sandbox-name", + "https://gateway.example.com", + "0.0.0.0:2222", + "secret", + 300, + &std::collections::HashMap::new(), + "", + "", 
+ ); + + assert_eq!( + pod_template["spec"]["runtimeClassName"], + serde_json::json!("kata-containers") + ); + } + #[test] fn gpu_sandbox_preserves_existing_resource_limits() { let template = SandboxTemplate { diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 9cc50085c..6bda277b4 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -201,7 +201,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certifi chmod +x /tmp/helm && \ rm -rf /var/lib/apt/lists/* -FROM ubuntu:24.04 AS nvidia-toolkit +FROM ubuntu:24.04 AS nvidia-container-toolkit ARG NVIDIA_CONTAINER_TOOLKIT_VERSION RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -213,10 +213,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list && \ apt-get update && \ apt-get install -y --no-install-recommends \ - "nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \ "nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \ - "libnvidia-container-tools=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \ - "libnvidia-container1=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" && \ rm -rf /var/lib/apt/lists/* # --------------------------------------------------------------------------- @@ -240,13 +237,10 @@ COPY --from=k3s /usr/share/zoneinfo/ /usr/share/zoneinfo/ ENV PATH="/var/lib/rancher/k3s/data/cni:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/bin/aux" \ CRI_CONFIG_FILE="/var/lib/rancher/k3s/agent/etc/crictl.yaml" -COPY --from=nvidia-toolkit /usr/bin/nvidia-cdi-hook /usr/bin/ -COPY --from=nvidia-toolkit /usr/bin/nvidia-container-runtime /usr/bin/ -COPY --from=nvidia-toolkit /usr/bin/nvidia-container-runtime-hook /usr/bin/ -COPY --from=nvidia-toolkit /usr/bin/nvidia-container-cli /usr/bin/ -COPY --from=nvidia-toolkit /usr/bin/nvidia-ctk /usr/bin/ -COPY --from=nvidia-toolkit /etc/nvidia-container-runtime 
/etc/nvidia-container-runtime -COPY --from=nvidia-toolkit /usr/lib/*-linux-gnu/libnvidia-container*.so* /usr/lib/ +COPY --from=nvidia-container-toolkit /usr/bin/nvidia-cdi-hook /usr/bin/ +COPY --from=nvidia-container-toolkit /usr/bin/nvidia-container-runtime /usr/bin/ +COPY --from=nvidia-container-toolkit /usr/bin/nvidia-ctk /usr/bin/ +COPY --from=nvidia-container-toolkit /etc/nvidia-container-runtime /etc/nvidia-container-runtime COPY --from=supervisor-builder /build/out/openshell-sandbox /opt/openshell/bin/openshell-sandbox RUN mkdir -p /var/lib/rancher/k3s/server/manifests \ diff --git a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml index 088562ac9..1cb0ca70a 100644 --- a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml +++ b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml @@ -12,6 +12,10 @@ # (which requires nvidia.com/gpu.present=true) is overridden to empty # so it schedules on any node without requiring NFD/GFD labels. # +# CDI injection mode: the device plugin uses deviceListStrategy=cdi-cri so that +# devices are injected via CDI hooks before container start. Sandbox pods only +# need the nvidia.com/gpu resource request — no runtimeClassName is required. +# # k3s auto-detects nvidia-container-runtime on PATH and registers the "nvidia" # RuntimeClass automatically, so no manual RuntimeClass manifest is needed. 
@@ -28,6 +32,11 @@ spec: createNamespace: true valuesContent: |- runtimeClassName: nvidia + deviceListStrategy: cdi-cri + deviceIDStrategy: index + cdi: + nvidiaHookPath: /usr/bin/nvidia-cdi-hook + nvidiaDriverRoot: "/" gfd: enabled: false nfd: From 0eebbc840fa645a9414803a77774a71396275e89 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Tue, 31 Mar 2026 08:55:31 -0700 Subject: [PATCH 20/45] fix(docker): restore apt cleanup chaining in cluster image (#702) --- deploy/docker/Dockerfile.images | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 6bda277b4..79c0d3e2a 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -213,7 +213,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list && \ apt-get update && \ apt-get install -y --no-install-recommends \ - "nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \ + "nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" && \ rm -rf /var/lib/apt/lists/* # --------------------------------------------------------------------------- From 2538bead50269794f7b000f895e0b8703e7b7907 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 31 Mar 2026 09:35:31 -0700 Subject: [PATCH 21/45] fix(cluster): pass resolv-conf as kubelet arg and pin k3s image digest (#701) --- architecture/gateway-single-node.md | 2 +- deploy/docker/Dockerfile.images | 11 ++++++++++- deploy/docker/cluster-entrypoint.sh | 8 +++++--- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 0921d3aaa..f46a50d2d 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -260,7 +260,7 @@ On Docker custom networks, `/etc/resolv.conf` contains `127.0.0.11` (Docker's in 2. 
Getting the container's `eth0` IP as a routable address. 3. Adding DNAT rules in PREROUTING to forward DNS from pod namespaces through to Docker's DNS. 4. Writing a custom resolv.conf pointing to the container IP. -5. Passing `--resolv-conf=/etc/rancher/k3s/resolv.conf` to k3s. +5. Passing `--kubelet-arg=resolv-conf=/etc/rancher/k3s/resolv.conf` to k3s. Falls back to `8.8.8.8` / `8.8.4.4` if iptables detection fails. diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 79c0d3e2a..af17b9b0a 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -12,7 +12,11 @@ # supervisor-builder Release openshell-sandbox binary # supervisor-output Minimal stage exporting only the supervisor binary +# Pin by tag AND manifest-list digest to prevent silent upstream republishes +# from breaking the build. Update both when bumping k3s versions. +# To refresh: docker buildx imagetools inspect rancher/k3s: | head -3 ARG K3S_VERSION=v1.35.2-k3s1 +ARG K3S_DIGEST=sha256:c3184157c3048112bab0c3e17405991da486cb3413511eba23f7650efd70776b ARG K9S_VERSION=v0.50.18 ARG HELM_VERSION=v3.17.3 ARG NVIDIA_CONTAINER_TOOLKIT_VERSION=1.18.2-1 @@ -181,7 +185,7 @@ CMD ["--port", "8080"] # --------------------------------------------------------------------------- # Cluster asset stages # --------------------------------------------------------------------------- -FROM rancher/k3s:${K3S_VERSION} AS k3s +FROM rancher/k3s:${K3S_VERSION}@${K3S_DIGEST} AS k3s FROM ubuntu:24.04 AS k9s ARG K9S_VERSION @@ -262,6 +266,11 @@ COPY deploy/kube/manifests/*.yaml /opt/openshell/manifests/ COPY deploy/kube/gpu-manifests/*.yaml /opt/openshell/gpu-manifests/ ENTRYPOINT ["/usr/local/bin/cluster-entrypoint.sh"] +# Default to "server" so bare `docker run ` works without requiring +# the caller to pass a subcommand. The openshell CLI already passes +# ["server", "--disable=traefik", ...] 
as CMD; this default only affects +# manual `docker run` invocations that omit a command. +CMD ["server"] HEALTHCHECK --interval=5s --timeout=5s --start-period=20s --retries=60 \ CMD ["/usr/local/bin/cluster-healthcheck.sh"] diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 2fea6fa61..d4717d88e 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -18,7 +18,7 @@ # embedded DNS resolver at 127.0.0.11. Docker's DNS listens on random high # ports (visible in the DOCKER_OUTPUT iptables chain), so we parse those ports # and set up DNAT rules to forward DNS traffic from k3s pods. We then point -# k3s's --resolv-conf at the container's routable eth0 IP. +# k3s's resolv-conf kubelet arg at the container's routable eth0 IP. # # Per k3s docs: "Manually specified resolver configuration files are not # subject to viability checks." @@ -562,6 +562,8 @@ fi # routing to settle first. wait_for_default_route -# Execute k3s with explicit resolv-conf. +# Execute k3s with explicit resolv-conf passed as a kubelet arg. +# k3s v1.35.2+ no longer accepts --resolv-conf as a top-level server flag; +# it must be passed via --kubelet-arg instead. # shellcheck disable=SC2086 -exec /bin/k3s "$@" --resolv-conf="$RESOLV_CONF" $EXTRA_KUBELET_ARGS +exec /bin/k3s "$@" --kubelet-arg=resolv-conf="$RESOLV_CONF" $EXTRA_KUBELET_ARGS From 151fca9dc5d7bb2ccc1e42b85b6eae2a71758f1f Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 31 Mar 2026 11:37:20 -0700 Subject: [PATCH 22/45] fix(server): return already_exists for duplicate sandbox names (#695) Check for existing sandbox name before persisting, matching the provider-creation pattern. The CLI now surfaces a clear hint instead of a raw UNIQUE constraint error. 
Closes #691 --- crates/openshell-cli/src/run.rs | 11 ++++++++++- crates/openshell-server/src/grpc.rs | 14 ++++++++++++++ e2e/python/test_inference_routing.py | 5 ----- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 67eafc886..c4f2833d2 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -2045,7 +2045,16 @@ pub async fn sandbox_create( name: name.unwrap_or_default().to_string(), }; - let response = client.create_sandbox(request).await.into_diagnostic()?; + let response = match client.create_sandbox(request).await { + Ok(resp) => resp, + Err(status) if status.code() == Code::AlreadyExists => { + return Err(miette::miette!( + "{}\n\nhint: delete it first with: openshell sandbox delete \n or use a different name", + status.message() + )); + } + Err(status) => return Err(status).into_diagnostic(), + }; let sandbox = response .into_inner() .sandbox diff --git a/crates/openshell-server/src/grpc.rs b/crates/openshell-server/src/grpc.rs index fd4bf5859..911d2f093 100644 --- a/crates/openshell-server/src/grpc.rs +++ b/crates/openshell-server/src/grpc.rs @@ -244,6 +244,20 @@ impl OpenShell for OpenShellService { ..Default::default() }; + // Reject duplicate names early, before touching the index or store. + // This mirrors the provider-creation pattern (see `create_provider`). + let existing = self + .state + .store + .get_message_by_name::(&name) + .await + .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))?; + if existing.is_some() { + return Err(Status::already_exists(format!( + "sandbox '{name}' already exists" + ))); + } + // Persist to the store FIRST so the sandbox watcher always finds // the record with `spec` populated. 
If we created the k8s // resource first, the watcher could race us and write a fallback diff --git a/e2e/python/test_inference_routing.py b/e2e/python/test_inference_routing.py index bda0d8cbc..c35e02535 100644 --- a/e2e/python/test_inference_routing.py +++ b/e2e/python/test_inference_routing.py @@ -95,11 +95,6 @@ def _upsert_managed_inference( except grpc.RpcError as create_exc: if create_exc.code() == grpc.StatusCode.ALREADY_EXISTS: continue - if ( - create_exc.code() == grpc.StatusCode.INTERNAL - and "UNIQUE constraint failed" in (create_exc.details() or "") - ): - continue raise else: raise RuntimeError("failed to upsert managed e2e provider after retries") From a1e1d5412a275f24134223e169d000d9559d0adb Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 31 Mar 2026 12:07:12 -0700 Subject: [PATCH 23/45] fix(bootstrap): stream image push through temp file to prevent OOM (#700) The image push pipeline buffered the entire Docker image tar 3x in memory (export, tar wrap, Bytes copy), causing OOM kills for images over ~1-2 GB. Replace the in-memory pipeline with a temp-file + streaming upload: export to a NamedTempFile, then stream the outer tar (header, 8 MiB file chunks, footer) directly into upload_to_container via body_try_stream. Peak memory drops from ~3x image size to ~8 MiB constant. Also adds incremental export progress reporting every 100 MiB. 
--- Cargo.lock | 1 + crates/openshell-bootstrap/Cargo.toml | 3 +- crates/openshell-bootstrap/src/push.rs | 193 ++++++++++++++++++------- 3 files changed, 142 insertions(+), 55 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9d8247e5d..7a0215c86 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2805,6 +2805,7 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" name = "openshell-bootstrap" version = "0.0.0" dependencies = [ + "async-stream", "base64 0.22.1", "bollard", "bytes", diff --git a/crates/openshell-bootstrap/Cargo.toml b/crates/openshell-bootstrap/Cargo.toml index ab57ad57a..942ffc48b 100644 --- a/crates/openshell-bootstrap/Cargo.toml +++ b/crates/openshell-bootstrap/Cargo.toml @@ -11,6 +11,7 @@ rust-version.workspace = true [dependencies] openshell-core = { path = "../openshell-core" } +async-stream = "0.3" base64 = "0.22" bollard = { version = "0.20", features = ["ssh"] } bytes = { workspace = true } @@ -20,11 +21,11 @@ rcgen = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tar = "0.4" +tempfile = "3" tokio = { workspace = true } tracing = { workspace = true } [dev-dependencies] -tempfile = "3" [lints] workspace = true diff --git a/crates/openshell-bootstrap/src/push.rs b/crates/openshell-bootstrap/src/push.rs index 0dcbaa6da..336d46c3e 100644 --- a/crates/openshell-bootstrap/src/push.rs +++ b/crates/openshell-bootstrap/src/push.rs @@ -8,15 +8,23 @@ //! uploaded into the gateway container as a tar file via the Docker //! `put_archive` API, and then imported into containerd via `ctr images import`. //! +//! To avoid unbounded memory usage with large images, the export is streamed +//! to a temporary file on disk, then streamed back through a tar wrapper into +//! the Docker upload API. Peak memory usage is `O(chunk_size)` regardless of +//! image size. +//! //! The standalone `ctr` binary is used (not `k3s ctr` which may not work in //! all k3s versions) with the k3s containerd socket. 
The default containerd //! namespace in k3s is already `k8s.io`, which is what kubelet uses. +use std::pin::Pin; + use bollard::Docker; use bollard::query_parameters::UploadToContainerOptionsBuilder; use bytes::Bytes; -use futures::StreamExt; +use futures::{Stream, StreamExt}; use miette::{IntoDiagnostic, Result, WrapErr}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; use crate::runtime::exec_capture_with_exit; @@ -26,11 +34,19 @@ const CONTAINERD_SOCK: &str = "/run/k3s/containerd/containerd.sock"; /// Path inside the container where the image tar is staged. const IMPORT_TAR_PATH: &str = "/tmp/openshell-images.tar"; +/// Size of chunks read from the temp file during streaming upload (8 MiB). +const UPLOAD_CHUNK_SIZE: usize = 8 * 1024 * 1024; + +/// Report export progress every N bytes (100 MiB). +const PROGRESS_INTERVAL_BYTES: u64 = 100 * 1024 * 1024; + /// Push a list of images from the local Docker daemon into a k3s gateway's /// containerd runtime. /// /// All images are exported as a single tar (shared layers are deduplicated), -/// uploaded to the container filesystem, and imported into containerd. +/// streamed to a temporary file, then uploaded to the container filesystem +/// and imported into containerd. Memory usage is bounded to `O(chunk_size)` +/// regardless of image size. pub async fn push_local_images( local_docker: &Docker, gateway_docker: &Docker, @@ -42,17 +58,30 @@ pub async fn push_local_images( return Ok(()); } - // 1. Export all images from the local Docker daemon as a single tar. - let image_tar = collect_export(local_docker, images).await?; + // 1. Export all images from the local Docker daemon to a temp file. + let (tmp_file, file_size) = export_to_tempfile(local_docker, images, on_log).await?; on_log(format!( "[progress] Exported {} MiB", - image_tar.len() / (1024 * 1024) + file_size / (1024 * 1024) )); - // 2. 
Wrap the image tar as a file inside an outer tar archive and upload - // it into the container filesystem via the Docker put_archive API. - let outer_tar = wrap_in_tar(IMPORT_TAR_PATH, &image_tar)?; - upload_archive(gateway_docker, container_name, &outer_tar).await?; + // 2. Stream the image tar wrapped in an outer tar archive into the + // container filesystem via the Docker put_archive API. + let parent_dir = IMPORT_TAR_PATH.rsplit_once('/').map_or("/", |(dir, _)| dir); + let options = UploadToContainerOptionsBuilder::default() + .path(parent_dir) + .build(); + + let upload_stream = streaming_tar_upload(IMPORT_TAR_PATH, tmp_file, file_size); + gateway_docker + .upload_to_container( + container_name, + Some(options), + bollard::body_try_stream(upload_stream), + ) + .await + .into_diagnostic() + .wrap_err("failed to upload image tar into container")?; on_log("[progress] Uploaded to gateway".to_string()); // 3. Import the tar into containerd via ctr. @@ -93,59 +122,115 @@ pub async fn push_local_images( Ok(()) } -/// Collect the full export tar from `docker.export_images()` into memory. -async fn collect_export(docker: &Docker, images: &[&str]) -> Result> { +/// Stream the Docker image export directly to a temporary file. +/// +/// Returns the temp file handle and the total number of bytes written. +/// Memory usage is `O(chunk_size)` — only one chunk is held at a time. +/// Progress is reported every [`PROGRESS_INTERVAL_BYTES`] bytes. +async fn export_to_tempfile( + docker: &Docker, + images: &[&str], + on_log: &mut impl FnMut(String), +) -> Result<(tempfile::NamedTempFile, u64)> { + let tmp = tempfile::NamedTempFile::new() + .into_diagnostic() + .wrap_err("failed to create temp file for image export")?; + + // Open a second handle for async writing; the NamedTempFile retains + // ownership and ensures cleanup on drop. 
+ let std_file = tmp + .reopen() + .into_diagnostic() + .wrap_err("failed to reopen temp file for writing")?; + let mut async_file = tokio::fs::File::from_std(std_file); + let mut stream = docker.export_images(images); - let mut buf = Vec::new(); + let mut total_bytes: u64 = 0; + let mut last_reported: u64 = 0; + while let Some(chunk) = stream.next().await { let bytes = chunk .into_diagnostic() .wrap_err("failed to read image export stream")?; - buf.extend_from_slice(&bytes); + async_file + .write_all(&bytes) + .await + .into_diagnostic() + .wrap_err("failed to write image data to temp file")?; + total_bytes += bytes.len() as u64; + + // Report progress periodically. + if total_bytes >= last_reported + PROGRESS_INTERVAL_BYTES { + let mb = total_bytes / (1024 * 1024); + on_log(format!("[progress] Exported {mb} MiB")); + last_reported = total_bytes; + } } - Ok(buf) -} -/// Wrap raw bytes as a single file inside a tar archive. -/// -/// The Docker `put_archive` API expects a tar that is extracted at a target -/// directory. We create a tar containing one entry whose name is the basename -/// of `file_path`, and upload it to the parent directory. -fn wrap_in_tar(file_path: &str, data: &[u8]) -> Result> { - let file_name = file_path.rsplit('/').next().unwrap_or(file_path); - - let mut builder = tar::Builder::new(Vec::new()); - let mut header = tar::Header::new_gnu(); - header.set_path(file_name).into_diagnostic()?; - header.set_size(data.len() as u64); - header.set_mode(0o644); - header.set_cksum(); - builder - .append(&header, data) - .into_diagnostic() - .wrap_err("failed to build tar archive for image upload")?; - builder - .into_inner() + async_file + .flush() + .await .into_diagnostic() - .wrap_err("failed to finalize tar archive") -} - -/// Upload a tar archive into the container at the parent directory of -/// [`IMPORT_TAR_PATH`]. 
-async fn upload_archive(docker: &Docker, container_name: &str, archive: &[u8]) -> Result<()> { - let parent_dir = IMPORT_TAR_PATH.rsplit_once('/').map_or("/", |(dir, _)| dir); + .wrap_err("failed to flush temp file")?; - let options = UploadToContainerOptionsBuilder::default() - .path(parent_dir) - .build(); + Ok((tmp, total_bytes)) +} - docker - .upload_to_container( - container_name, - Some(options), - bollard::body_full(Bytes::copy_from_slice(archive)), - ) - .await - .into_diagnostic() - .wrap_err("failed to upload image tar into container") +/// Create a stream that yields an outer tar archive containing the image tar +/// as a single entry, reading the image data from the temp file in chunks. +/// +/// The Docker `put_archive` API expects a tar that is extracted at a target +/// directory. We construct a tar with one entry whose name is the basename +/// of `file_path`. The stream yields: +/// 1. A 512-byte GNU tar header +/// 2. The file content in [`UPLOAD_CHUNK_SIZE`] chunks +/// 3. Padding to a 512-byte boundary + two 512-byte zero EOF blocks +/// +/// Memory usage is O([`UPLOAD_CHUNK_SIZE`]) regardless of file size. +fn streaming_tar_upload( + file_path: &str, + tmp_file: tempfile::NamedTempFile, + file_size: u64, +) -> Pin> + Send>> { + let file_name = file_path + .rsplit('/') + .next() + .unwrap_or(file_path) + .to_string(); + + Box::pin(async_stream::try_stream! { + // 1. Build and yield the tar header. + let mut header = tar::Header::new_gnu(); + header.set_path(&file_name)?; + header.set_size(file_size); + header.set_mode(0o644); + header.set_cksum(); + yield Bytes::copy_from_slice(header.as_bytes()); + + // 2. Stream the temp file content in chunks. + let std_file = tmp_file.reopen()?; + let mut async_file = tokio::fs::File::from_std(std_file); + let mut buf = vec![0u8; UPLOAD_CHUNK_SIZE]; + loop { + let n = async_file.read(&mut buf).await?; + if n == 0 { + break; + } + yield Bytes::copy_from_slice(&buf[..n]); + } + + // 3. 
Yield tar padding and EOF blocks. + // Tar entries must be padded to a 512-byte boundary, followed by + // two 512-byte zero blocks to signal end-of-archive. + let padding_len = if file_size.is_multiple_of(512) { + 0 + } else { + 512 - (file_size % 512) as usize + }; + let footer = vec![0u8; padding_len + 1024]; + yield Bytes::from(footer); + + // The NamedTempFile is dropped here, cleaning up the temp file. + drop(tmp_file); + }) } From 3b4c1d4ec3eafd408156b7b193d5a1451c8c1c44 Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Tue, 31 Mar 2026 14:40:00 -0700 Subject: [PATCH 24/45] docs(agents): add security analysis protocol to principal-engineer-reviewer (#711) tmyers@users.noreply.github.com> --- .claude/agents/principal-engineer-reviewer.md | 55 ++++++++++++++++++- .../agents/principal-engineer-reviewer.md | 55 ++++++++++++++++++- 2 files changed, 106 insertions(+), 4 deletions(-) diff --git a/.claude/agents/principal-engineer-reviewer.md b/.claude/agents/principal-engineer-reviewer.md index fe231f806..ae7e49ea2 100644 --- a/.claude/agents/principal-engineer-reviewer.md +++ b/.claude/agents/principal-engineer-reviewer.md @@ -25,8 +25,12 @@ OpenShell project. Your reviews balance three priorities equally: 3. **Security** — What are the threat surfaces? Are trust boundaries respected? Is input validated at system boundaries? Are secrets, credentials, and - tokens handled correctly? Think about the OWASP top 10, supply chain risks, - and privilege escalation. + tokens handled correctly? Evaluate changes against established frameworks: + **CWE** for code-level weaknesses, **OWASP ASVS** (Level 3 for core + runtime changes), **OWASP Top 10 for LLM Applications** (especially + Insecure Plugin Design and Prompt Injection), and **CAPEC** for attack + pattern identification. Consider supply chain risks and privilege + escalation paths. 
## Project context @@ -95,6 +99,53 @@ Structure your review clearly: Omit empty sections. Keep it concise — density over length. +## Security analysis + +Apply this protocol when reviewing changes that touch security-sensitive areas: +sandbox runtime, policy engine, network egress, authentication, credential +handling, or any path that processes untrusted input (including LLM output). + +1. **Threat modeling** — Map the data flow for the change. Where does untrusted + input (from an LLM, user, or network) enter? Where does it exit (to a + shell, filesystem, network, or database)? Identify trust boundaries that + the change crosses. + +2. **Weakness mapping** — Tag every security concern with its **CWE ID**. This + makes findings actionable and trackable. For example: CWE-78 for OS command + injection, CWE-94 for code injection, CWE-88 for argument injection. + +3. **Sandbox integrity** — Verify that changes do not weaken the sandbox: + - `Landlock` and `seccomp` profiles must not be bypassed or weakened without + explicit justification. + - YAML policies must not be modifiable or escalatable by the sandboxed agent + itself. + - Default-deny posture must be preserved. + +4. **Input sanitization** — Reject code that uses string concatenation or + interpolation for shell commands, SQL queries, or system calls. Demand + parameterized execution or strict allow-list validation. + +5. **Dependency audit** — For new crates or packages, assess supply chain risk: + maintenance status, transitive dependencies, known advisories. + +### Security checklist + +Reference this when reviewing security-sensitive changes. Not every item +applies to every PR — use judgment. + +- **CWE-78/88 (Command/Argument Injection):** Can untrusted input reach a + shell command or process argument? +- **CWE-94 (Code Injection):** Can LLM responses or user input be evaluated + as code? +- **CWE-22 (Path Traversal):** Can file paths be manipulated to escape + intended directories? 
+- **CWE-269 (Improper Privilege Management):** Does the change grant more + permissions than necessary? +- **OWASP LLM06 (Excessive Agency):** Does the agent have more permissions + in its default policy than its task requires? +- **Supply chain:** Do new dependencies introduce known vulnerabilities or + unmaintained transitive dependencies? + ## Principles - Don't nitpick style unless it harms readability. Trust `rustfmt` and the diff --git a/.opencode/agents/principal-engineer-reviewer.md b/.opencode/agents/principal-engineer-reviewer.md index 452548d2c..68c3a86d0 100644 --- a/.opencode/agents/principal-engineer-reviewer.md +++ b/.opencode/agents/principal-engineer-reviewer.md @@ -25,8 +25,12 @@ OpenShell project. Your reviews balance three priorities equally: 3. **Security** — What are the threat surfaces? Are trust boundaries respected? Is input validated at system boundaries? Are secrets, credentials, and - tokens handled correctly? Think about the OWASP top 10, supply chain risks, - and privilege escalation. + tokens handled correctly? Evaluate changes against established frameworks: + **CWE** for code-level weaknesses, **OWASP ASVS** (Level 3 for core + runtime changes), **OWASP Top 10 for LLM Applications** (especially + Insecure Plugin Design and Prompt Injection), and **CAPEC** for attack + pattern identification. Consider supply chain risks and privilege + escalation paths. ## Project context @@ -95,6 +99,53 @@ Structure your review clearly: Omit empty sections. Keep it concise — density over length. +## Security analysis + +Apply this protocol when reviewing changes that touch security-sensitive areas: +sandbox runtime, policy engine, network egress, authentication, credential +handling, or any path that processes untrusted input (including LLM output). + +1. **Threat modeling** — Map the data flow for the change. Where does untrusted + input (from an LLM, user, or network) enter? Where does it exit (to a + shell, filesystem, network, or database)? 
Identify trust boundaries that + the change crosses. + +2. **Weakness mapping** — Tag every security concern with its **CWE ID**. This + makes findings actionable and trackable. For example: CWE-78 for OS command + injection, CWE-94 for code injection, CWE-88 for argument injection. + +3. **Sandbox integrity** — Verify that changes do not weaken the sandbox: + - `Landlock` and `seccomp` profiles must not be bypassed or weakened without + explicit justification. + - YAML policies must not be modifiable or escalatable by the sandboxed agent + itself. + - Default-deny posture must be preserved. + +4. **Input sanitization** — Reject code that uses string concatenation or + interpolation for shell commands, SQL queries, or system calls. Demand + parameterized execution or strict allow-list validation. + +5. **Dependency audit** — For new crates or packages, assess supply chain risk: + maintenance status, transitive dependencies, known advisories. + +### Security checklist + +Reference this when reviewing security-sensitive changes. Not every item +applies to every PR — use judgment. + +- **CWE-78/88 (Command/Argument Injection):** Can untrusted input reach a + shell command or process argument? +- **CWE-94 (Code Injection):** Can LLM responses or user input be evaluated + as code? +- **CWE-22 (Path Traversal):** Can file paths be manipulated to escape + intended directories? +- **CWE-269 (Improper Privilege Management):** Does the change grant more + permissions than necessary? +- **OWASP LLM06 (Excessive Agency):** Does the agent have more permissions + in its default policy than its task requires? +- **Supply chain:** Do new dependencies introduce known vulnerabilities or + unmaintained transitive dependencies? + ## Principles - Don't nitpick style unless it harms readability. Trust `rustfmt` and the From a2f9da5b88911aabb6e99da76b6da6870b8cf4fd Mon Sep 17 00:00:00 2001 From: "John T. 
Myers" <9696606+johntmyers@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:21:58 -0700 Subject: [PATCH 25/45] feat(sandbox): extend L7 credential injection to query params, Basic auth, and URL paths (#708) * feat(sandbox): extend L7 credential injection to query params, Basic auth, and URL paths Closes #689 Extend SecretResolver to resolve openshell:resolve:env:* placeholders in URL query parameters, Basic auth tokens, and URL path segments. Absorbs working code from PR #631 for query param and Basic auth support. Adds path rewriting for Telegram-style APIs (/bot{TOKEN}/method). Changes all placeholder rewriting to fail-closed: unresolved placeholders cause HTTP 500 instead of forwarding raw placeholder strings. Validates resolved secret values for CRLF/null injection (CWE-113). Validates path credentials for traversal sequences (CWE-22). Rewrites request targets before OPA L7 policy evaluation. OPA receives a redacted path with [CREDENTIAL] markers. Real secrets appear only in the upstream connection. All log statements use redacted targets. 
* fix(docs): remove base64 example that triggers secret scanning alert * test(sandbox): add relay integration tests for all credential injection methods Add 9 integration tests that exercise credential injection through the full relay_http_request_with_resolver pipeline with simulated upstream: - Bearer header injection - Exact header value injection (x-api-key) - Basic auth decode/resolve/re-encode - Query parameter injection - URL path injection (Telegram-style concatenated) - URL path injection (standalone segment) - Combined path + header injection - Fail-closed: unresolved header placeholder - Fail-closed: unresolved path placeholder --- Cargo.lock | 1 + architecture/sandbox-providers.md | 33 +- architecture/sandbox.md | 151 ++- crates/openshell-sandbox/Cargo.toml | 3 + crates/openshell-sandbox/src/l7/relay.rs | 73 +- crates/openshell-sandbox/src/l7/rest.rs | 405 ++++++- crates/openshell-sandbox/src/proxy.rs | 39 +- crates/openshell-sandbox/src/secrets.rs | 1270 +++++++++++++++++++++- docs/sandboxes/manage-providers.md | 41 + 9 files changed, 1937 insertions(+), 79 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7a0215c86..e49301771 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2936,6 +2936,7 @@ name = "openshell-sandbox" version = "0.0.0" dependencies = [ "anyhow", + "base64 0.22.1", "bytes", "clap", "hex", diff --git a/architecture/sandbox-providers.md b/architecture/sandbox-providers.md index 16b7948bc..fe5d48a97 100644 --- a/architecture/sandbox-providers.md +++ b/architecture/sandbox-providers.md @@ -305,18 +305,31 @@ start from `env_clear()`, so the handshake secret is not present there. ### Proxy-Time Secret Resolution -When a sandboxed tool uses one of these placeholder env vars to populate an outbound HTTP -header (for example `Authorization: Bearer openshell:resolve:env:ANTHROPIC_API_KEY`), the -sandbox proxy rewrites the placeholder to the real secret value immediately before the -request is forwarded upstream. 
- -This applies to: - -- forward-proxy HTTP requests, and -- L7-inspected REST requests inside CONNECT tunnels. +When a sandboxed tool uses one of these placeholder env vars in an outbound HTTP request, +the sandbox proxy rewrites the placeholder to the real secret value immediately before the +request is forwarded upstream. Placeholders are resolved in four locations: + +- **HTTP header values** — exact match (`x-api-key: openshell:resolve:env:KEY`), prefixed + match (`Authorization: Bearer openshell:resolve:env:KEY`), and Base64-decoded Basic auth + tokens (`Authorization: Basic `) +- **URL query parameters** — for APIs that authenticate via query string + (e.g., `?key=openshell:resolve:env:YOUTUBE_API_KEY`) +- **URL path segments** — for APIs that embed tokens in the URL path + (e.g., `/bot/sendMessage` for Telegram Bot API) + +This applies to forward-proxy HTTP requests, L7-inspected REST requests inside CONNECT +tunnels, and credential-injection-only passthrough relays on TLS-terminated connections. + +All rewriting fails closed: if any `openshell:resolve:env:*` placeholder is detected but +cannot be resolved, the proxy rejects the request with HTTP 500 instead of forwarding the +raw placeholder upstream. Resolved secret values are validated for prohibited control +characters (CR, LF, null byte) to prevent header injection (CWE-113). Path segment +credentials are additionally validated to reject traversal sequences, path separators, and +URI delimiters (CWE-22). The real secret value remains in supervisor memory only; it is not re-injected into the -child process environment. +child process environment. See [Credential injection](sandbox.md#credential-injection) for +the full implementation details, encoding rules, and security properties. 
### End-to-End Flow diff --git a/architecture/sandbox.md b/architecture/sandbox.md index 333cef5ea..c870708dd 100644 --- a/architecture/sandbox.md +++ b/architecture/sandbox.md @@ -33,6 +33,7 @@ All paths are relative to `crates/openshell-sandbox/src/`. | `l7/relay.rs` | Protocol-aware bidirectional relay with per-request OPA evaluation, credential-injection-only passthrough relay | | `l7/rest.rs` | HTTP/1.1 request/response parsing, body framing (Content-Length, chunked), deny response generation | | `l7/provider.rs` | `L7Provider` trait and `L7Request`/`BodyLength` types | +| `secrets.rs` | `SecretResolver` credential placeholder system — placeholder generation, multi-location rewriting (headers, query params, path segments, Basic auth), fail-closed scanning, secret validation, percent-encoding | ## Startup and Orchestration @@ -824,11 +825,13 @@ When `Router::proxy_with_candidates()` returns an error, `router_error_to_http() | `RouterError` variant | HTTP status | Response body | |----------------------|-------------|---------------| -| `RouteNotFound(hint)` | `400` | `no route configured for route '{hint}'` | -| `NoCompatibleRoute(protocol)` | `400` | `no compatible route for source protocol '{protocol}'` | -| `Unauthorized(msg)` | `401` | `{msg}` | -| `UpstreamUnavailable(msg)` | `503` | `{msg}` | -| `UpstreamProtocol(msg)` / `Internal(msg)` | `502` | `{msg}` | +| `RouteNotFound(_)` | `400` | `no inference route configured` | +| `NoCompatibleRoute(_)` | `400` | `no compatible inference route available` | +| `Unauthorized(_)` | `401` | `unauthorized` | +| `UpstreamUnavailable(_)` | `503` | `inference service unavailable` | +| `UpstreamProtocol(_)` / `Internal(_)` | `502` | `inference service error` | + +Response messages are generic — internal details (upstream URLs, hostnames, TLS errors, route hints) are never exposed to the sandboxed process. Full error context is logged server-side at `warn` level. 
### Inference routing context @@ -1027,20 +1030,131 @@ TLS termination is automatic. The proxy peeks the first bytes of every CONNECT t System CA bundles are searched at well-known paths: `/etc/ssl/certs/ca-certificates.crt` (Debian/Ubuntu), `/etc/pki/tls/certs/ca-bundle.crt` (RHEL), `/etc/ssl/ca-bundle.pem` (openSUSE), `/etc/ssl/cert.pem` (Alpine/macOS). -### Credential-injection-only relay +### Credential injection + +**Files:** `crates/openshell-sandbox/src/secrets.rs`, `crates/openshell-sandbox/src/l7/relay.rs`, `crates/openshell-sandbox/src/l7/rest.rs`, `crates/openshell-sandbox/src/proxy.rs` + +The sandbox proxy resolves `openshell:resolve:env:*` credential placeholders in outbound HTTP requests. The `SecretResolver` holds a supervisor-only map from placeholder strings to real secret values, constructed at startup from the provider environment. Child processes only see placeholder values in their environment; the proxy rewrites them to real secrets immediately before forwarding upstream. + +#### `SecretResolver` + +```rust +pub(crate) struct SecretResolver { + by_placeholder: HashMap, +} +``` + +`SecretResolver::from_provider_env()` splits the provider environment into two maps: a child-visible map with placeholder values (`openshell:resolve:env:ANTHROPIC_API_KEY`) and a supervisor-only resolver map (`{"openshell:resolve:env:ANTHROPIC_API_KEY": "sk-real-key"}`). The placeholder grammar is `openshell:resolve:env:[A-Za-z_][A-Za-z0-9_]*`. 
+ +#### Credential placement locations + +The resolver rewrites placeholders in four locations within HTTP requests: + +| Location | Example | Encoding | Implementation | +|----------|---------|----------|----------------| +| Header value (exact) | `x-api-key: openshell:resolve:env:KEY` | None (raw replacement) | `rewrite_header_value()` | +| Header value (prefixed) | `Authorization: Bearer openshell:resolve:env:KEY` | None (prefix preserved) | `rewrite_header_value()` | +| Basic auth token | `Authorization: Basic ` | Base64 decode → resolve → re-encode | `rewrite_basic_auth_token()` | +| URL query parameter | `?key=openshell:resolve:env:KEY` | Percent-decode → resolve → percent-encode (RFC 3986 unreserved) | `rewrite_uri_query_params()` | +| URL path segment | `/bot/sendMessage` | Percent-decode → resolve → validate → percent-encode (RFC 3986 pchar) | `rewrite_uri_path()` → `rewrite_path_segment()` | + +**Header values**: Direct match replaces the entire value. Prefixed match (e.g., `Bearer `) splits on whitespace, resolves the placeholder portion, and reassembles. Basic auth match detects `Authorization: Basic `, decodes the Base64 content, resolves any placeholders in the decoded `user:password` string, and re-encodes. + +**Query parameters**: Each `key=value` pair is checked. Values are percent-decoded before resolution and percent-encoded after (RFC 3986 Section 2.3 unreserved characters preserved: `ALPHA / DIGIT / "-" / "." / "_" / "~"`). + +**Path segments**: Handles substring matching for APIs that embed tokens within path segments (e.g., Telegram's `/bot{TOKEN}/sendMessage`). Each segment is percent-decoded, scanned for placeholder boundaries using the env var key grammar (`[A-Za-z_][A-Za-z0-9_]*`), resolved, validated for path safety, and percent-encoded per RFC 3986 Section 3.3 pchar rules (`unreserved / sub-delims / ":" / "@"`). 
+ +#### Path credential validation (CWE-22) + +Resolved credential values destined for URL path segments are validated by `validate_credential_for_path()` before insertion. The following values are rejected: + +| Pattern | Rejection reason | +|---------|-----------------| +| `../`, `..\\`, `..` | Path traversal sequence | +| `/`, `\` | Path separator | +| `\0`, `\r`, `\n` | Control character | +| `?`, `#` | URI delimiter | + +Rejection causes the request to fail closed (HTTP 500). + +#### Secret value validation (CWE-113) + +All resolved credential values are validated at the `resolve_placeholder()` level for prohibited control characters: CR (`\r`), LF (`\n`), and null byte (`\0`). This prevents HTTP header injection via malicious credential values. The validation applies to all placement locations automatically — header values, query parameters, and path segments all pass through `resolve_placeholder()`. + +#### Fail-closed behavior + +All placeholder rewriting fails closed. If any `openshell:resolve:env:*` placeholder is detected in the request but cannot be resolved, the proxy rejects the request with HTTP 500 instead of forwarding the raw placeholder to the upstream. The fail-closed mechanism operates at two levels: + +1. **Per-location**: Each rewrite function (`rewrite_uri_query_params`, `rewrite_path_segment`, `rewrite_header_line`) returns an `UnresolvedPlaceholderError` when a placeholder is detected but the resolver has no mapping for it. + +2. **Final scan**: After all rewriting completes, `rewrite_http_header_block()` scans the output for any remaining `openshell:resolve:env:` tokens. It also checks the percent-decoded form of the request line to catch encoded placeholder bypass attempts (e.g., `openshell%3Aresolve%3Aenv%3AUNKNOWN`). 
+ +```rust +pub(crate) struct UnresolvedPlaceholderError { + pub location: &'static str, // "header", "query_param", "path" +} +``` + +#### Rewrite-before-OPA with redaction + +When L7 inspection is active, credential placeholders in the request target (path + query) are resolved BEFORE OPA L7 policy evaluation. This is implemented in `relay_with_inspection()` and `relay_passthrough_with_credentials()` in `l7/relay.rs`: + +1. `rewrite_target_for_eval()` resolves the request target, producing two strings: + - **Resolved**: real secrets inserted — used only for the upstream connection + - **Redacted**: `[CREDENTIAL]` markers in place of secrets — used for OPA input and logs + +2. OPA `evaluate_l7_request()` receives the redacted path in `request.path`, so policy rules never see real credential values. + +3. All log statements (`L7_REQUEST`, `HTTP_REQUEST`) use the redacted target. Real credential values never appear in logs. + +4. The resolved path (with real secrets) goes only to the upstream via `relay_http_request_with_resolver()`. + +```rust +pub(crate) struct RewriteTargetResult { + pub resolved: String, // for upstream forwarding only + pub redacted: String, // for OPA + logs +} +``` + +If credential resolution fails on the request target, the relay returns HTTP 500 and closes the connection. + +#### Credential-injection-only relay **File:** `crates/openshell-sandbox/src/l7/relay.rs` (`relay_passthrough_with_credentials()`) -When TLS is auto-terminated but no L7 policy (`protocol` + `access`/`rules`) is configured on the endpoint, the proxy enters a passthrough mode that still provides value: it parses HTTP requests minimally to rewrite credential placeholders (via `SecretResolver`) and logs each request for observability. This relay: +When TLS is auto-terminated but no L7 policy (`protocol` + `access`/`rules`) is configured on the endpoint, the proxy enters a passthrough mode that still provides credential injection and observability. This relay: 1. 
Reads each HTTP request from the client via `RestProvider::parse_request()` -2. Logs the request method, path, host, and port at `info!()` level (tagged `"HTTP relay (credential injection)"`) -3. Forwards the request to upstream via `relay_http_request_with_resolver()`, which rewrites headers containing `openshell:resolve:env:*` placeholders with actual provider credential values -4. Relays the upstream response back to the client -5. Loops for HTTP keep-alive; exits on client close or non-reusable response +2. Resolves and redacts the request target via `rewrite_target_for_eval()` (for log safety) +3. Logs the request method, redacted path, host, and port at `info!()` level (tagged `HTTP_REQUEST`) +4. Forwards the request to upstream via `relay_http_request_with_resolver()`, which rewrites all credential placeholders in headers, query parameters, path segments, and Basic auth tokens +5. Relays the upstream response back to the client +6. Loops for HTTP keep-alive; exits on client close or non-reusable response This enables credential injection on all HTTPS endpoints automatically, without requiring the policy author to add `protocol: rest` and `access: full` just to get credentials injected. +#### Known limitation: host-binding + +The resolver resolves all placeholders regardless of destination host. If an agent has OPA-allowed access to an attacker-controlled host, it could construct a URL containing a placeholder and exfiltrate the resolved credential value to that host. OPA host restrictions are the defense — only endpoints explicitly allowed by policy receive traffic. Per-credential host binding (restricting which credentials resolve for which destination hosts) is not implemented. + +#### Data flow + +```mermaid +sequenceDiagram + participant A as Agent Process + participant P as Proxy (SecretResolver) + participant O as OPA Engine + participant U as Upstream API + + A->>P: GET /bot/send?key= HTTP/1.1
Authorization: Bearer + P->>P: rewrite_target_for_eval(target)
→ resolved: /bot{secret}/send?key={secret}
→ redacted: /bot[CREDENTIAL]/send?key=[CREDENTIAL] + P->>O: evaluate_l7_request(redacted path) + O-->>P: allow + P->>P: rewrite_http_header_block(headers)
→ resolve header placeholders
→ resolve query param placeholders
→ resolve path segment placeholders
→ fail-closed scan + P->>U: GET /bot{secret}/send?key={secret} HTTP/1.1
Authorization: Bearer {secret} + Note over P: Logs use redacted path only +``` + ### REST protocol provider **File:** `crates/openshell-sandbox/src/l7/rest.rs` @@ -1060,11 +1174,12 @@ Implements `L7Provider` for HTTP/1.1: `relay_with_inspection()` in `crates/openshell-sandbox/src/l7/relay.rs` is the main relay loop: 1. Parse one HTTP request from client via the provider -2. Build L7 input JSON with `request.method`, `request.path`, `request.query_params`, plus the CONNECT-level context (host, port, binary, ancestors, cmdline) -3. Evaluate `data.openshell.sandbox.allow_request` and `data.openshell.sandbox.request_deny_reason` -4. Log the L7 decision (tagged `L7_REQUEST`) -5. If allowed (or audit mode): relay request to upstream and response back to client, then loop -6. If denied in enforce mode: send 403 and close the connection +2. Resolve credential placeholders in the request target via `rewrite_target_for_eval()`. OPA receives the redacted path (`[CREDENTIAL]` markers); the resolved path goes only to upstream. If resolution fails, return HTTP 500 and close the connection. +3. Build L7 input JSON with `request.method`, the **redacted** `request.path`, `request.query_params`, plus the CONNECT-level context (host, port, binary, ancestors, cmdline) +4. Evaluate `data.openshell.sandbox.allow_request` and `data.openshell.sandbox.request_deny_reason` +5. Log the L7 decision (tagged `L7_REQUEST`) using the redacted target — real credential values never appear in logs +6. If allowed (or audit mode): relay request to upstream via `relay_http_request_with_resolver()` (which rewrites all remaining credential placeholders in headers, query parameters, path segments, and Basic auth tokens) and relay the response back to client, then loop +7. If denied in enforce mode: send 403 (using redacted target in the response body) and close the connection ## Process Identity @@ -1317,6 +1432,10 @@ The sandbox uses `miette` for error reporting and `thiserror` for typed errors. 
| Log push gRPC stream breaks | Push loop exits, flushes remaining batch | | Proxy accept error | Log + break accept loop | | Benign connection close (EOF, reset, pipe) | Debug level (not visible to user by default) | +| Credential injection: unresolved placeholder detected | HTTP 500, connection closed (fail-closed) | +| Credential injection: resolved value contains CR/LF/null | Placeholder treated as unresolvable, fail-closed | +| Credential injection: path credential contains traversal/separator | HTTP 500, connection closed (fail-closed) | +| Credential injection: percent-encoded placeholder bypass attempt | HTTP 500, connection closed (fail-closed) | | L7 parse error | Close the connection | | SSH server failure | Async task error logged, main process unaffected | | Process timeout | Kill process, return exit code 124 | diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index 8a0639a7d..26da57efd 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -52,6 +52,9 @@ webpki-roots = { workspace = true } # HTTP bytes = { workspace = true } +# Encoding +base64 = { workspace = true } + # IP network / CIDR parsing ipnet = "2" diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-sandbox/src/l7/relay.rs index e0ad2a18c..49caea64d 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-sandbox/src/l7/relay.rs @@ -9,10 +9,10 @@ use crate::l7::provider::L7Provider; use crate::l7::{EnforcementMode, L7EndpointConfig, L7Protocol, L7RequestInfo}; -use crate::secrets::SecretResolver; +use crate::secrets::{self, SecretResolver}; use miette::{IntoDiagnostic, Result, miette}; use std::sync::{Arc, Mutex}; -use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tracing::{debug, info, warn}; /// Context for L7 request policy evaluation. 
@@ -105,13 +105,36 @@ where } }; + // Rewrite credential placeholders in the request target BEFORE OPA + // evaluation. OPA sees the redacted path; the resolved path goes only + // to the upstream write. + let (eval_target, redacted_target) = if let Some(ref resolver) = ctx.secret_resolver { + match secrets::rewrite_target_for_eval(&req.target, resolver) { + Ok(result) => (result.resolved, result.redacted), + Err(e) => { + warn!( + host = %ctx.host, + port = ctx.port, + error = %e, + "credential resolution failed in request target, rejecting" + ); + let response = b"HTTP/1.1 500 Internal Server Error\r\nContent-Length: 0\r\nConnection: close\r\n\r\n"; + client.write_all(response).await.into_diagnostic()?; + client.flush().await.into_diagnostic()?; + return Ok(()); + } + } + } else { + (req.target.clone(), req.target.clone()) + }; + let request_info = L7RequestInfo { action: req.action.clone(), - target: req.target.clone(), + target: redacted_target.clone(), query_params: req.query_params.clone(), }; - // Evaluate L7 policy via Rego + // Evaluate L7 policy via Rego (using redacted target) let (allowed, reason) = evaluate_l7_request(engine, ctx, &request_info)?; let decision_str = match (allowed, config.enforcement) { @@ -120,20 +143,23 @@ where (false, EnforcementMode::Enforce) => "deny", }; - // Log every L7 decision + // Log every L7 decision (using redacted target — never log real secrets) info!( dst_host = %ctx.host, dst_port = ctx.port, policy = %ctx.policy_name, l7_protocol = "rest", l7_action = %request_info.action, - l7_target = %request_info.target, + l7_target = %redacted_target, l7_query_params = ?request_info.query_params, l7_decision = decision_str, l7_deny_reason = %reason, "L7_REQUEST", ); + // Store the resolved target for the deny response redaction + let _ = &eval_target; + if allowed || config.enforcement == EnforcementMode::Audit { // Forward request to upstream and relay response let reusable = crate::l7::rest::relay_http_request_with_resolver( 
@@ -152,9 +178,15 @@ where return Ok(()); } } else { - // Enforce mode: deny with 403 and close connection + // Enforce mode: deny with 403 and close connection (use redacted target) crate::l7::rest::RestProvider - .deny(&req, &ctx.policy_name, &reason, client) + .deny_with_redacted_target( + &req, + &ctx.policy_name, + &reason, + client, + Some(&redacted_target), + ) .await?; return Ok(()); } @@ -266,13 +298,34 @@ where request_count += 1; - // Log for observability. + // Resolve and redact the target for logging. + let redacted_target = if let Some(ref res) = ctx.secret_resolver { + match secrets::rewrite_target_for_eval(&req.target, res) { + Ok(result) => result.redacted, + Err(e) => { + warn!( + host = %ctx.host, + port = ctx.port, + error = %e, + "credential resolution failed in request target, rejecting" + ); + let response = b"HTTP/1.1 500 Internal Server Error\r\nContent-Length: 0\r\nConnection: close\r\n\r\n"; + client.write_all(response).await.into_diagnostic()?; + client.flush().await.into_diagnostic()?; + return Ok(()); + } + } + } else { + req.target.clone() + }; + + // Log for observability (using redacted target — never log real secrets). let has_creds = resolver.is_some(); info!( host = %ctx.host, port = ctx.port, method = %req.action, - path = %req.target, + path = %redacted_target, credentials_injected = has_creds, request_num = request_count, "HTTP_REQUEST", diff --git a/crates/openshell-sandbox/src/l7/rest.rs b/crates/openshell-sandbox/src/l7/rest.rs index da453ce16..ec5494c9d 100644 --- a/crates/openshell-sandbox/src/l7/rest.rs +++ b/crates/openshell-sandbox/src/l7/rest.rs @@ -47,7 +47,21 @@ impl L7Provider for RestProvider { reason: &str, client: &mut C, ) -> Result<()> { - send_deny_response(req, policy_name, reason, client).await + send_deny_response(req, policy_name, reason, client, None).await + } +} + +impl RestProvider { + /// Deny with a redacted target for the response body. 
+ pub(crate) async fn deny_with_redacted_target( + &self, + req: &L7Request, + policy_name: &str, + reason: &str, + client: &mut C, + redacted_target: Option<&str>, + ) -> Result<()> { + send_deny_response(req, policy_name, reason, client, redacted_target).await } } @@ -247,10 +261,11 @@ where .position(|w| w == b"\r\n\r\n") .map_or(req.raw_header.len(), |p| p + 4); - let rewritten_header = rewrite_http_header_block(&req.raw_header[..header_end], resolver); + let rewrite_result = rewrite_http_header_block(&req.raw_header[..header_end], resolver) + .map_err(|e| miette!("credential injection failed: {e}"))?; upstream - .write_all(&rewritten_header) + .write_all(&rewrite_result.rewritten) .await .into_diagnostic()?; @@ -278,16 +293,21 @@ where } /// Send a 403 Forbidden JSON deny response. +/// +/// When `redacted_target` is provided, it is used instead of `req.target` +/// in the response body to avoid leaking resolved credential values. async fn send_deny_response( req: &L7Request, policy_name: &str, reason: &str, client: &mut C, + redacted_target: Option<&str>, ) -> Result<()> { + let target = redacted_target.unwrap_or(&req.target); let body = serde_json::json!({ "error": "policy_denied", "policy": policy_name, - "rule": format!("{} {}", req.action, req.target), + "rule": format!("{} {}", req.action, target), "detail": reason }); let body_bytes = body.to_string(); @@ -742,6 +762,7 @@ fn is_benign_close(err: &std::io::Error) -> bool { mod tests { use super::*; use crate::secrets::SecretResolver; + use base64::Engine as _; #[test] fn parse_content_length() { @@ -1371,8 +1392,8 @@ mod tests { ); let raw = b"GET /v1/messages HTTP/1.1\r\nAuthorization: Bearer openshell:resolve:env:ANTHROPIC_API_KEY\r\nHost: example.com\r\n\r\n"; - let rewritten = rewrite_http_header_block(raw, resolver.as_ref()); - let rewritten = String::from_utf8(rewritten).expect("utf8"); + let result = rewrite_http_header_block(raw, resolver.as_ref()).expect("should succeed"); + let rewritten = 
String::from_utf8(result.rewritten).expect("utf8"); assert!(rewritten.contains("Authorization: Bearer sk-test\r\n")); assert!(!rewritten.contains("openshell:resolve:env:ANTHROPIC_API_KEY")); @@ -1552,4 +1573,376 @@ mod tests { "Real secret should NOT appear without resolver, got: {forwarded}" ); } + + // ========================================================================= + // Credential injection integration tests + // + // Each test exercises a different injection location through the full + // relay_http_request_with_resolver pipeline: child builds an HTTP request + // with a placeholder, the relay rewrites it, and we verify what upstream + // receives. + // ========================================================================= + + /// Helper: run a request through the relay and capture what upstream receives. + async fn relay_and_capture( + raw_header: Vec<u8>, + body_length: BodyLength, + resolver: Option<&SecretResolver>, + ) -> Result<String> { + let (mut proxy_to_upstream, mut upstream_side) = tokio::io::duplex(8192); + let (mut _app_side, mut proxy_to_client) = tokio::io::duplex(8192); + + // Parse the request line to extract action and target for L7Request + let header_str = String::from_utf8_lossy(&raw_header); + let first_line = header_str.lines().next().unwrap_or(""); + let parts: Vec<&str> = first_line.splitn(3, ' ').collect(); + let action = parts.first().unwrap_or(&"GET").to_string(); + let target = parts.get(1).unwrap_or(&"/").to_string(); + + let req = L7Request { + action, + target, + query_params: HashMap::new(), + raw_header, + body_length, + }; + + let content_len = match body_length { + BodyLength::ContentLength(n) => n, + _ => 0, + }; + + let upstream_task = tokio::spawn(async move { + let mut buf = vec![0u8; 8192]; + let mut total = 0; + loop { + let n = upstream_side.read(&mut buf[total..]).await.unwrap(); + if n == 0 { + break; + } + total += n; + if let Some(hdr_end) = buf[..total].windows(4).position(|w| w == b"\r\n\r\n") { + if total >= 
hdr_end + 4 + content_len as usize { + break; + } + } + } + upstream_side + .write_all(b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nok") + .await + .unwrap(); + upstream_side.flush().await.unwrap(); + String::from_utf8_lossy(&buf[..total]).to_string() + }); + + let relay = tokio::time::timeout( + std::time::Duration::from_secs(5), + relay_http_request_with_resolver( + &req, + &mut proxy_to_client, + &mut proxy_to_upstream, + resolver, + ), + ) + .await + .map_err(|_| miette!("relay timed out"))?; + relay?; + + let forwarded = upstream_task + .await + .map_err(|e| miette!("upstream task failed: {e}"))?; + Ok(forwarded) + } + + #[tokio::test] + async fn relay_injects_bearer_header_credential() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("API_KEY".to_string(), "sk-real-secret-key".to_string())] + .into_iter() + .collect(), + ); + let placeholder = child_env.get("API_KEY").unwrap(); + + let raw = format!( + "POST /v1/chat HTTP/1.1\r\n\ + Host: api.example.com\r\n\ + Authorization: Bearer {placeholder}\r\n\ + Content-Length: 2\r\n\r\n{{}}" + ); + + let forwarded = relay_and_capture( + raw.into_bytes(), + BodyLength::ContentLength(2), + resolver.as_ref(), + ) + .await + .expect("relay should succeed"); + + assert!( + forwarded.contains("Authorization: Bearer sk-real-secret-key\r\n"), + "Upstream should see real Bearer token, got: {forwarded}" + ); + assert!( + !forwarded.contains("openshell:resolve:env:"), + "Placeholder leaked to upstream: {forwarded}" + ); + } + + #[tokio::test] + async fn relay_injects_exact_header_credential() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("CUSTOM_TOKEN".to_string(), "tok-12345".to_string())] + .into_iter() + .collect(), + ); + let placeholder = child_env.get("CUSTOM_TOKEN").unwrap(); + + let raw = format!( + "GET /api/data HTTP/1.1\r\n\ + Host: api.example.com\r\n\ + x-api-key: {placeholder}\r\n\ + Content-Length: 0\r\n\r\n" + ); + + let forwarded = relay_and_capture( + 
raw.into_bytes(), + BodyLength::ContentLength(0), + resolver.as_ref(), + ) + .await + .expect("relay should succeed"); + + assert!( + forwarded.contains("x-api-key: tok-12345\r\n"), + "Upstream should see real x-api-key, got: {forwarded}" + ); + assert!(!forwarded.contains("openshell:resolve:env:")); + } + + #[tokio::test] + async fn relay_injects_basic_auth_credential() { + let b64 = base64::engine::general_purpose::STANDARD; + + let (child_env, resolver) = SecretResolver::from_provider_env( + [("REGISTRY_PASS".to_string(), "hunter2".to_string())] + .into_iter() + .collect(), + ); + let placeholder = child_env.get("REGISTRY_PASS").unwrap(); + let encoded = b64.encode(format!("deploy:{placeholder}").as_bytes()); + + let raw = format!( + "GET /v2/_catalog HTTP/1.1\r\n\ + Host: registry.example.com\r\n\ + Authorization: Basic {encoded}\r\n\ + Content-Length: 0\r\n\r\n" + ); + + let forwarded = relay_and_capture( + raw.into_bytes(), + BodyLength::ContentLength(0), + resolver.as_ref(), + ) + .await + .expect("relay should succeed"); + + // Extract and decode the Basic auth token from what upstream received + let auth_line = forwarded + .lines() + .find(|l| l.starts_with("Authorization: Basic")) + .expect("upstream should have Authorization header"); + let token = auth_line + .strip_prefix("Authorization: Basic ") + .unwrap() + .trim(); + let decoded = b64.decode(token).expect("valid base64"); + let decoded_str = std::str::from_utf8(&decoded).expect("valid utf8"); + + assert_eq!( + decoded_str, "deploy:hunter2", + "Decoded Basic auth should contain real password" + ); + assert!(!forwarded.contains("openshell:resolve:env:")); + } + + #[tokio::test] + async fn relay_injects_query_param_credential() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("YOUTUBE_KEY".to_string(), "AIzaSy-secret".to_string())] + .into_iter() + .collect(), + ); + let placeholder = child_env.get("YOUTUBE_KEY").unwrap(); + + let raw = format!( + "GET 
/v3/search?part=snippet&key={placeholder} HTTP/1.1\r\n\ + Host: www.googleapis.com\r\n\ + Content-Length: 0\r\n\r\n" + ); + + let forwarded = relay_and_capture( + raw.into_bytes(), + BodyLength::ContentLength(0), + resolver.as_ref(), + ) + .await + .expect("relay should succeed"); + + assert!( + forwarded.contains("key=AIzaSy-secret"), + "Upstream should see real API key in query param, got: {forwarded}" + ); + assert!( + forwarded.contains("part=snippet"), + "Non-secret query params should be preserved, got: {forwarded}" + ); + assert!(!forwarded.contains("openshell:resolve:env:")); + } + + #[tokio::test] + async fn relay_injects_url_path_credential_telegram_style() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [( + "TELEGRAM_TOKEN".to_string(), + "123456:ABC-DEF1234ghIkl".to_string(), + )] + .into_iter() + .collect(), + ); + let placeholder = child_env.get("TELEGRAM_TOKEN").unwrap(); + + let raw = format!( + "POST /bot{placeholder}/sendMessage HTTP/1.1\r\n\ + Host: api.telegram.org\r\n\ + Content-Length: 2\r\n\r\n{{}}" + ); + + let forwarded = relay_and_capture( + raw.into_bytes(), + BodyLength::ContentLength(2), + resolver.as_ref(), + ) + .await + .expect("relay should succeed"); + + assert!( + forwarded.contains("POST /bot123456:ABC-DEF1234ghIkl/sendMessage HTTP/1.1"), + "Upstream should see real token in URL path, got: {forwarded}" + ); + assert!(!forwarded.contains("openshell:resolve:env:")); + } + + #[tokio::test] + async fn relay_injects_url_path_credential_standalone_segment() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("ORG_TOKEN".to_string(), "org-abc-789".to_string())] + .into_iter() + .collect(), + ); + let placeholder = child_env.get("ORG_TOKEN").unwrap(); + + let raw = format!( + "GET /api/{placeholder}/resources HTTP/1.1\r\n\ + Host: api.example.com\r\n\ + Content-Length: 0\r\n\r\n" + ); + + let forwarded = relay_and_capture( + raw.into_bytes(), + BodyLength::ContentLength(0), + resolver.as_ref(), + 
) + .await + .expect("relay should succeed"); + + assert!( + forwarded.contains("GET /api/org-abc-789/resources HTTP/1.1"), + "Upstream should see real token in path segment, got: {forwarded}" + ); + assert!(!forwarded.contains("openshell:resolve:env:")); + } + + #[tokio::test] + async fn relay_injects_combined_path_and_header_credentials() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [ + ("PATH_TOKEN".to_string(), "tok-path-123".to_string()), + ("HEADER_KEY".to_string(), "sk-header-456".to_string()), + ] + .into_iter() + .collect(), + ); + let path_ph = child_env.get("PATH_TOKEN").unwrap(); + let header_ph = child_env.get("HEADER_KEY").unwrap(); + + let raw = format!( + "POST /bot{path_ph}/send HTTP/1.1\r\n\ + Host: api.example.com\r\n\ + x-api-key: {header_ph}\r\n\ + Content-Length: 2\r\n\r\n{{}}" + ); + + let forwarded = relay_and_capture( + raw.into_bytes(), + BodyLength::ContentLength(2), + resolver.as_ref(), + ) + .await + .expect("relay should succeed"); + + assert!( + forwarded.contains("/bottok-path-123/send"), + "Upstream should see real token in path, got: {forwarded}" + ); + assert!( + forwarded.contains("x-api-key: sk-header-456\r\n"), + "Upstream should see real token in header, got: {forwarded}" + ); + assert!(!forwarded.contains("openshell:resolve:env:")); + } + + #[tokio::test] + async fn relay_fail_closed_rejects_unresolved_placeholder() { + // Create a resolver that knows about KEY1 but not UNKNOWN_KEY + let (child_env, resolver) = SecretResolver::from_provider_env( + [("KEY1".to_string(), "secret1".to_string())] + .into_iter() + .collect(), + ); + let _ = child_env; + + // The request references a placeholder that the resolver doesn't know + let raw = b"GET /api HTTP/1.1\r\n\ + Host: example.com\r\n\ + x-api-key: openshell:resolve:env:UNKNOWN_KEY\r\n\ + Content-Length: 0\r\n\r\n" + .to_vec(); + + let result = relay_and_capture(raw, BodyLength::ContentLength(0), resolver.as_ref()).await; + + assert!( + result.is_err(), + 
"Relay should fail when placeholder cannot be resolved" + ); + } + + #[tokio::test] + async fn relay_fail_closed_rejects_unresolved_path_placeholder() { + let (_, resolver) = SecretResolver::from_provider_env( + [("KEY1".to_string(), "secret1".to_string())] + .into_iter() + .collect(), + ); + + let raw = + b"GET /api/openshell:resolve:env:UNKNOWN_KEY/data HTTP/1.1\r\nHost: x\r\nContent-Length: 0\r\n\r\n" + .to_vec(); + + let result = relay_and_capture(raw, BodyLength::ContentLength(0), resolver.as_ref()).await; + + assert!( + result.is_err(), + "Relay should fail when path placeholder cannot be resolved" + ); + } } diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 49fe5e07b..ce980a45c 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -1617,7 +1617,7 @@ fn rewrite_forward_request( used: usize, path: &str, secret_resolver: Option<&SecretResolver>, -) -> Vec<u8> { +) -> Result<Vec<u8>, crate::secrets::UnresolvedPlaceholderError> { let header_end = raw[..used] .windows(4) .position(|w| w == b"\r\n\r\n") @@ -1698,7 +1698,15 @@ fn rewrite_forward_request( output.extend_from_slice(&raw[header_end..used]); } - output + // Fail-closed: scan for any remaining unresolved placeholders + if secret_resolver.is_some() { + let output_str = String::from_utf8_lossy(&output); + if output_str.contains(crate::secrets::PLACEHOLDER_PREFIX_PUBLIC) { + return Err(crate::secrets::UnresolvedPlaceholderError { location: "header" }); + } + } + + Ok(output) } /// Handle a plain HTTP forward proxy request (non-CONNECT). @@ -2040,7 +2048,19 @@ async fn handle_forward_proxy( ); // 9. 
Rewrite request and forward to upstream - let rewritten = rewrite_forward_request(buf, used, &path, secret_resolver.as_deref()); + let rewritten = match rewrite_forward_request(buf, used, &path, secret_resolver.as_deref()) { + Ok(bytes) => bytes, + Err(e) => { + warn!( + dst_host = %host_lc, + dst_port = port, + error = %e, + "credential injection failed in forward proxy" + ); + respond(client, b"HTTP/1.1 500 Internal Server Error\r\n\r\n").await?; + return Ok(()); + } + }; upstream.write_all(&rewritten).await.into_diagnostic()?; // 8. Relay remaining traffic bidirectionally (supports streaming) @@ -2740,7 +2760,7 @@ mod tests { fn test_rewrite_get_request() { let raw = b"GET http://10.0.0.1:8000/api HTTP/1.1\r\nHost: 10.0.0.1:8000\r\nAccept: */*\r\n\r\n"; - let result = rewrite_forward_request(raw, raw.len(), "/api", None); + let result = rewrite_forward_request(raw, raw.len(), "/api", None).expect("should succeed"); let result_str = String::from_utf8_lossy(&result); assert!(result_str.starts_with("GET /api HTTP/1.1\r\n")); assert!(result_str.contains("Host: 10.0.0.1:8000")); @@ -2751,7 +2771,7 @@ mod tests { #[test] fn test_rewrite_strips_proxy_headers() { let raw = b"GET http://host/p HTTP/1.1\r\nHost: host\r\nProxy-Authorization: Basic abc\r\nProxy-Connection: keep-alive\r\nAccept: */*\r\n\r\n"; - let result = rewrite_forward_request(raw, raw.len(), "/p", None); + let result = rewrite_forward_request(raw, raw.len(), "/p", None).expect("should succeed"); let result_str = String::from_utf8_lossy(&result); assert!( !result_str @@ -2765,7 +2785,7 @@ mod tests { #[test] fn test_rewrite_replaces_connection_header() { let raw = b"GET http://host/p HTTP/1.1\r\nHost: host\r\nConnection: keep-alive\r\n\r\n"; - let result = rewrite_forward_request(raw, raw.len(), "/p", None); + let result = rewrite_forward_request(raw, raw.len(), "/p", None).expect("should succeed"); let result_str = String::from_utf8_lossy(&result); assert!(result_str.contains("Connection: close")); 
assert!(!result_str.contains("keep-alive")); @@ -2774,7 +2794,7 @@ mod tests { #[test] fn test_rewrite_preserves_body_overflow() { let raw = b"POST http://host/api HTTP/1.1\r\nHost: host\r\nContent-Length: 13\r\n\r\n{\"key\":\"val\"}"; - let result = rewrite_forward_request(raw, raw.len(), "/api", None); + let result = rewrite_forward_request(raw, raw.len(), "/api", None).expect("should succeed"); let result_str = String::from_utf8_lossy(&result); assert!(result_str.contains("{\"key\":\"val\"}")); assert!(result_str.contains("POST /api HTTP/1.1")); @@ -2783,7 +2803,7 @@ mod tests { #[test] fn test_rewrite_preserves_existing_via() { let raw = b"GET http://host/p HTTP/1.1\r\nHost: host\r\nVia: 1.0 upstream\r\n\r\n"; - let result = rewrite_forward_request(raw, raw.len(), "/p", None); + let result = rewrite_forward_request(raw, raw.len(), "/p", None).expect("should succeed"); let result_str = String::from_utf8_lossy(&result); assert!(result_str.contains("Via: 1.0 upstream")); // Should not add a second Via header @@ -2798,7 +2818,8 @@ mod tests { .collect(), ); let raw = b"GET http://host/p HTTP/1.1\r\nHost: host\r\nAuthorization: Bearer openshell:resolve:env:ANTHROPIC_API_KEY\r\n\r\n"; - let result = rewrite_forward_request(raw, raw.len(), "/p", resolver.as_ref()); + let result = rewrite_forward_request(raw, raw.len(), "/p", resolver.as_ref()) + .expect("should succeed"); let result_str = String::from_utf8_lossy(&result); assert!(result_str.contains("Authorization: Bearer sk-test")); assert!(!result_str.contains("openshell:resolve:env:ANTHROPIC_API_KEY")); diff --git a/crates/openshell-sandbox/src/secrets.rs b/crates/openshell-sandbox/src/secrets.rs index 4ee1ee846..88b84831e 100644 --- a/crates/openshell-sandbox/src/secrets.rs +++ b/crates/openshell-sandbox/src/secrets.rs @@ -1,10 +1,66 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// SPDX-License-Identifier: Apache-2.0 +use base64::Engine as _; use std::collections::HashMap; +use std::fmt; const PLACEHOLDER_PREFIX: &str = "openshell:resolve:env:"; +/// Public access to the placeholder prefix for fail-closed scanning in other modules. +pub(crate) const PLACEHOLDER_PREFIX_PUBLIC: &str = PLACEHOLDER_PREFIX; + +/// Characters that are valid in an env var key name (used to extract +/// placeholder boundaries within concatenated strings like path segments). +fn is_env_key_char(b: u8) -> bool { + b.is_ascii_alphanumeric() || b == b'_' +} + +// --------------------------------------------------------------------------- +// Error and result types +// --------------------------------------------------------------------------- + +/// Error returned when a placeholder cannot be resolved or a resolved secret +/// contains prohibited characters. +#[derive(Debug)] +pub(crate) struct UnresolvedPlaceholderError { + pub location: &'static str, // "header", "query_param", "path" +} + +impl fmt::Display for UnresolvedPlaceholderError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "unresolved credential placeholder in {}: detected openshell:resolve:env:* token that could not be resolved", + self.location + ) + } +} + +/// Result of rewriting an HTTP header block with credential resolution. +#[derive(Debug)] +pub(crate) struct RewriteResult { + /// The rewritten HTTP bytes (headers + body overflow). + pub rewritten: Vec<u8>, + /// A redacted version of the request target for logging. + /// Contains `[CREDENTIAL]` in place of resolved credential values. + /// `None` if the target was not modified. + pub redacted_target: Option<String>, +} + +/// Result of rewriting a request target for OPA evaluation. +#[derive(Debug)] +pub(crate) struct RewriteTargetResult { + /// The resolved target (real secrets) — for upstream forwarding only. + pub resolved: String, + /// The redacted target (`[CREDENTIAL]` in place of secrets) — for OPA + logs. 
+ pub redacted: String, +} + +// --------------------------------------------------------------------------- +// SecretResolver +// --------------------------------------------------------------------------- + #[derive(Debug, Clone, Default)] pub(crate) struct SecretResolver { by_placeholder: HashMap<String, String>, @@ -30,45 +86,513 @@ impl SecretResolver { (child_env, Some(Self { by_placeholder })) } + /// Resolve a placeholder string to the real secret value. + /// + /// Returns `None` if the placeholder is unknown or the resolved value + /// contains prohibited control characters (CRLF, null byte). pub(crate) fn resolve_placeholder(&self, value: &str) -> Option<&str> { - self.by_placeholder.get(value).map(String::as_str) + let secret = self.by_placeholder.get(value).map(String::as_str)?; + match validate_resolved_secret(secret) { + Ok(s) => Some(s), + Err(reason) => { + tracing::warn!( + location = "resolve_placeholder", + reason, + "credential resolution rejected: resolved value contains prohibited characters" + ); + None + } + } } pub(crate) fn rewrite_header_value(&self, value: &str) -> Option<String> { + // Direct placeholder match: `x-api-key: openshell:resolve:env:KEY` if let Some(secret) = self.resolve_placeholder(value.trim()) { return Some(secret.to_string()); } let trimmed = value.trim(); + + // Basic auth decoding: `Basic <base64>` where the decoded content + // contains a placeholder (e.g. `user:openshell:resolve:env:PASS`). 
+ if let Some(encoded) = trimmed + .strip_prefix("Basic ") + .or_else(|| trimmed.strip_prefix("basic ")) + .map(str::trim) + { + if let Some(rewritten) = self.rewrite_basic_auth_token(encoded) { + return Some(format!("Basic {rewritten}")); + } + } + + // Prefixed placeholder: `Bearer openshell:resolve:env:KEY` let split_at = trimmed.find(char::is_whitespace)?; let prefix = &trimmed[..split_at]; let candidate = trimmed[split_at..].trim(); let secret = self.resolve_placeholder(candidate)?; Some(format!("{prefix} {secret}")) } + + /// Decode a Base64-encoded Basic auth token, resolve any placeholders in + /// the decoded `username:password` string, and re-encode. + /// + /// Returns `None` if decoding fails or no placeholders are found. + fn rewrite_basic_auth_token(&self, encoded: &str) -> Option<String> { + let b64 = base64::engine::general_purpose::STANDARD; + let decoded_bytes = b64.decode(encoded.trim()).ok()?; + let decoded = std::str::from_utf8(&decoded_bytes).ok()?; + + // Check if the decoded string contains any placeholder + if !decoded.contains(PLACEHOLDER_PREFIX) { + return None; + } + + // Rewrite all placeholder occurrences in the decoded string + let mut rewritten = decoded.to_string(); + for (placeholder, secret) in &self.by_placeholder { + if rewritten.contains(placeholder.as_str()) { + // Validate the resolved secret for control characters + if validate_resolved_secret(secret).is_err() { + tracing::warn!( + location = "basic_auth", + "credential resolution rejected: resolved value contains prohibited characters" + ); + return None; + } + rewritten = rewritten.replace(placeholder.as_str(), secret); + } + } + + // Only return if we actually changed something + if rewritten == decoded { + return None; + } + + Some(b64.encode(rewritten.as_bytes())) + } } pub(crate) fn placeholder_for_env_key(key: &str) -> String { format!("{PLACEHOLDER_PREFIX}{key}") } -pub(crate) fn rewrite_http_header_block(raw: &[u8], resolver: Option<&SecretResolver>) -> Vec<u8> { +// 
--------------------------------------------------------------------------- +// Secret validation (F1 — CWE-113) +// --------------------------------------------------------------------------- + +/// Validate that a resolved secret value does not contain characters that +/// could enable HTTP header injection or request splitting. +fn validate_resolved_secret(value: &str) -> Result<&str, &'static str> { + if value + .bytes() + .any(|b| b == b'\r' || b == b'\n' || b == b'\0') + { + return Err("resolved secret contains prohibited control characters (CR, LF, or NUL)"); + } + Ok(value) +} + +// --------------------------------------------------------------------------- +// Percent encoding/decoding (RFC 3986) +// --------------------------------------------------------------------------- + +/// Percent-encode a string for safe use in URL query parameter values. +/// +/// Encodes all characters except unreserved characters (RFC 3986 Section 2.3): +/// ALPHA / DIGIT / "-" / "." / "_" / "~" +fn percent_encode_query(input: &str) -> String { + let mut encoded = String::with_capacity(input.len()); + for byte in input.bytes() { + match byte { + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~' => { + encoded.push(byte as char); + } + _ => { + use fmt::Write; + let _ = write!(encoded, "%{byte:02X}"); + } + } + } + encoded +} + +/// Percent-encode a string for safe use in URL path segments. +/// +/// RFC 3986 §3.3: pchar = unreserved / pct-encoded / sub-delims / ":" / "@" +/// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" +/// +/// Must encode: `/`, `?`, `#`, space, and other non-pchar characters. +fn percent_encode_path_segment(input: &str) -> String { + let mut encoded = String::with_capacity(input.len()); + for byte in input.bytes() { + match byte { + // unreserved + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~' => { + encoded.push(byte as char); + } + // sub-delims + ":" + "@" + b'!' 
| b'$' | b'&' | b'\'' | b'(' | b')' | b'*' | b'+' | b',' | b';' | b'=' | b':' + | b'@' => { + encoded.push(byte as char); + } + _ => { + use fmt::Write; + let _ = write!(encoded, "%{byte:02X}"); + } + } + } + encoded +} + +/// Percent-decode a URL-encoded string. +fn percent_decode(input: &str) -> String { + let mut decoded = Vec::with_capacity(input.len()); + let mut bytes = input.bytes(); + while let Some(b) = bytes.next() { + if b == b'%' { + let hi = bytes.next(); + let lo = bytes.next(); + if let (Some(h), Some(l)) = (hi, lo) { + let hex = [h, l]; + if let Ok(s) = std::str::from_utf8(&hex) { + if let Ok(val) = u8::from_str_radix(s, 16) { + decoded.push(val); + continue; + } + } + // Invalid percent encoding — preserve verbatim + decoded.push(b'%'); + decoded.push(h); + decoded.push(l); + } else { + decoded.push(b'%'); + if let Some(h) = hi { + decoded.push(h); + } + } + } else { + decoded.push(b); + } + } + String::from_utf8_lossy(&decoded).into_owned() +} + +// --------------------------------------------------------------------------- +// Path credential validation (F3 — CWE-22) +// --------------------------------------------------------------------------- + +/// Validate that a resolved credential value is safe for use in a URL path segment. +/// +/// Operates on the raw (decoded) credential value before percent-encoding. +/// Rejects values that could enable path traversal, request splitting, or +/// URI structure breakage. +fn validate_credential_for_path(value: &str) -> Result<(), String> { + if value.contains("../") || value.contains("..\\") || value == ".." 
{ + return Err("credential contains path traversal sequence".into()); + } + if value.contains('\0') || value.contains('\r') || value.contains('\n') { + return Err("credential contains control character".into()); + } + if value.contains('/') || value.contains('\\') { + return Err("credential contains path separator".into()); + } + if value.contains('?') || value.contains('#') { + return Err("credential contains URI delimiter".into()); + } + Ok(()) +} + +// --------------------------------------------------------------------------- +// URI rewriting +// --------------------------------------------------------------------------- + +/// Result of rewriting the request line. +struct RewriteLineResult { + /// The rewritten request line. + line: String, + /// Redacted target for logging (if any rewriting occurred). + redacted_target: Option, +} + +/// Rewrite credential placeholders in the request line's URL. +/// +/// Given a request line like `GET /bot{TOKEN}/path?key={APIKEY} HTTP/1.1`, +/// resolves placeholders in both path segments and query parameter values. 
+fn rewrite_request_line( + line: &str, + resolver: &SecretResolver, +) -> Result<RewriteLineResult, UnresolvedPlaceholderError> { + // Request line format: METHOD SP REQUEST-URI SP HTTP-VERSION + let mut parts = line.splitn(3, ' '); + let method = match parts.next() { + Some(m) => m, + None => { + return Ok(RewriteLineResult { + line: line.to_string(), + redacted_target: None, + }); + } + }; + let uri = match parts.next() { + Some(u) => u, + None => { + return Ok(RewriteLineResult { + line: line.to_string(), + redacted_target: None, + }); + } + }; + let version = match parts.next() { + Some(v) => v, + None => { + return Ok(RewriteLineResult { + line: line.to_string(), + redacted_target: None, + }); + } + }; + + // Only rewrite if the URI contains a placeholder + if !uri.contains(PLACEHOLDER_PREFIX) { + return Ok(RewriteLineResult { + line: line.to_string(), + redacted_target: None, + }); + } + + // Split URI into path and query + let (path, query) = match uri.split_once('?') { + Some((p, q)) => (p, Some(q)), + None => (uri, None), + }; + + // Rewrite path segments + let (resolved_path, redacted_path) = match rewrite_uri_path(path, resolver)? { + Some((resolved, redacted)) => (resolved, redacted), + None => (path.to_string(), path.to_string()), + }; + + // Rewrite query params + let (resolved_query, redacted_query) = match query { + Some(q) => match rewrite_uri_query_params(q, resolver)? { + Some((resolved, redacted)) => (Some(resolved), Some(redacted)), + None => (Some(q.to_string()), Some(q.to_string())), + }, + None => (None, None), + }; + + // Reassemble + let resolved_uri = match &resolved_query { + Some(q) => format!("{resolved_path}?{q}"), + None => resolved_path.clone(), + }; + let redacted_uri = match &redacted_query { + Some(q) => format!("{redacted_path}?{q}"), + None => redacted_path, + }; + + Ok(RewriteLineResult { + line: format!("{method} {resolved_uri} {version}"), + redacted_target: Some(redacted_uri), + }) +} + +/// Rewrite placeholders in URL path segments. 
+///
+/// Handles substring matching for cases like Telegram's `/bot{TOKEN}/method`
+/// where the placeholder is concatenated with literal text in a segment.
+///
+/// Returns `Some((resolved_path, redacted_path))` if any placeholders were found,
+/// `None` if no placeholders exist in the path.
+fn rewrite_uri_path(
+    path: &str,
+    resolver: &SecretResolver,
+) -> Result<Option<(String, String)>, UnresolvedPlaceholderError> {
+    if !path.contains(PLACEHOLDER_PREFIX) {
+        return Ok(None);
+    }
+
+    let segments: Vec<&str> = path.split('/').collect();
+    let mut resolved_segments = Vec::with_capacity(segments.len());
+    let mut redacted_segments = Vec::with_capacity(segments.len());
+    let mut any_rewritten = false;
+
+    for segment in &segments {
+        let decoded = percent_decode(segment);
+        if !decoded.contains(PLACEHOLDER_PREFIX) {
+            resolved_segments.push(segment.to_string());
+            redacted_segments.push(segment.to_string());
+            continue;
+        }
+
+        let (resolved, redacted) = rewrite_path_segment(&decoded, resolver)?;
+        // Percent-encode the resolved segment for path context
+        resolved_segments.push(percent_encode_path_segment(&resolved));
+        redacted_segments.push(redacted);
+        any_rewritten = true;
+    }
+
+    if !any_rewritten {
+        return Ok(None);
+    }
+
+    Ok(Some((
+        resolved_segments.join("/"),
+        redacted_segments.join("/"),
+    )))
+}
+
+/// Rewrite placeholders within a single path segment (already percent-decoded).
+///
+/// Uses the placeholder grammar `openshell:resolve:env:[A-Za-z_][A-Za-z0-9_]*`
+/// to determine placeholder boundaries within concatenated text.
+fn rewrite_path_segment( + segment: &str, + resolver: &SecretResolver, +) -> Result<(String, String), UnresolvedPlaceholderError> { + let mut resolved = String::with_capacity(segment.len()); + let mut redacted = String::with_capacity(segment.len()); + let mut pos = 0; + let bytes = segment.as_bytes(); + + while pos < bytes.len() { + if let Some(start) = segment[pos..].find(PLACEHOLDER_PREFIX) { + let abs_start = pos + start; + // Copy literal prefix before the placeholder + resolved.push_str(&segment[pos..abs_start]); + redacted.push_str(&segment[pos..abs_start]); + + // Extract the key name using the env var grammar: [A-Za-z_][A-Za-z0-9_]* + let key_start = abs_start + PLACEHOLDER_PREFIX.len(); + let key_end = segment[key_start..] + .bytes() + .position(|b| !is_env_key_char(b)) + .map_or(segment.len(), |p| key_start + p); + + if key_end == key_start { + // Empty key — not a valid placeholder, copy literally + resolved.push_str(&segment[abs_start..abs_start + PLACEHOLDER_PREFIX.len()]); + redacted.push_str(&segment[abs_start..abs_start + PLACEHOLDER_PREFIX.len()]); + pos = abs_start + PLACEHOLDER_PREFIX.len(); + continue; + } + + let full_placeholder = &segment[abs_start..key_end]; + if let Some(secret) = resolver.resolve_placeholder(full_placeholder) { + validate_credential_for_path(secret).map_err(|reason| { + tracing::warn!( + location = "path", + %reason, + "credential resolution rejected: resolved value unsafe for path" + ); + UnresolvedPlaceholderError { location: "path" } + })?; + resolved.push_str(secret); + redacted.push_str("[CREDENTIAL]"); + } else { + return Err(UnresolvedPlaceholderError { location: "path" }); + } + pos = key_end; + } else { + // No more placeholders in remainder + resolved.push_str(&segment[pos..]); + redacted.push_str(&segment[pos..]); + break; + } + } + + Ok((resolved, redacted)) +} + +/// Rewrite placeholders in query parameter values. +/// +/// Returns `Some((resolved_query, redacted_query))` if any placeholders were found. 
+fn rewrite_uri_query_params(
+    query: &str,
+    resolver: &SecretResolver,
+) -> Result<Option<(String, String)>, UnresolvedPlaceholderError> {
+    if !query.contains(PLACEHOLDER_PREFIX) {
+        return Ok(None);
+    }
+
+    let mut resolved_params = Vec::new();
+    let mut redacted_params = Vec::new();
+    let mut any_rewritten = false;
+
+    for param in query.split('&') {
+        if let Some((key, value)) = param.split_once('=') {
+            let decoded_value = percent_decode(value);
+            if let Some(secret) = resolver.resolve_placeholder(&decoded_value) {
+                resolved_params.push(format!("{key}={}", percent_encode_query(secret)));
+                redacted_params.push(format!("{key}=[CREDENTIAL]"));
+                any_rewritten = true;
+            } else if decoded_value.contains(PLACEHOLDER_PREFIX) {
+                // Placeholder detected but not resolved
+                return Err(UnresolvedPlaceholderError {
+                    location: "query_param",
+                });
+            } else {
+                resolved_params.push(param.to_string());
+                redacted_params.push(param.to_string());
+            }
+        } else {
+            resolved_params.push(param.to_string());
+            redacted_params.push(param.to_string());
+        }
+    }
+
+    if !any_rewritten {
+        return Ok(None);
+    }
+
+    Ok(Some((resolved_params.join("&"), redacted_params.join("&"))))
+}
+
+// ---------------------------------------------------------------------------
+// Public rewrite API
+// ---------------------------------------------------------------------------
+
+/// Rewrite credential placeholders in an HTTP header block.
+///
+/// Resolves `openshell:resolve:env:*` placeholders in the request line
+/// (path segments and query parameter values), header values (including
+/// Basic auth tokens), and returns a `RewriteResult` with the rewritten
+/// bytes and a redacted target for logging.
+///
+/// Returns `Err` if any placeholder is detected but cannot be resolved
+/// (fail-closed behavior).
+pub(crate) fn rewrite_http_header_block( + raw: &[u8], + resolver: Option<&SecretResolver>, +) -> Result { let Some(resolver) = resolver else { - return raw.to_vec(); + return Ok(RewriteResult { + rewritten: raw.to_vec(), + redacted_target: None, + }); }; let Some(header_end) = raw.windows(4).position(|w| w == b"\r\n\r\n").map(|p| p + 4) else { - return raw.to_vec(); + return Ok(RewriteResult { + rewritten: raw.to_vec(), + redacted_target: None, + }); }; let header_str = String::from_utf8_lossy(&raw[..header_end]); let mut lines = header_str.split("\r\n"); let Some(request_line) = lines.next() else { - return raw.to_vec(); + return Ok(RewriteResult { + rewritten: raw.to_vec(), + redacted_target: None, + }); }; + // Rewrite request line (path + query params) + let rl_result = rewrite_request_line(request_line, resolver)?; + let mut output = Vec::with_capacity(raw.len()); - output.extend_from_slice(request_line.as_bytes()); + output.extend_from_slice(rl_result.line.as_bytes()); output.extend_from_slice(b"\r\n"); for line in lines { @@ -82,7 +606,25 @@ pub(crate) fn rewrite_http_header_block(raw: &[u8], resolver: Option<&SecretReso output.extend_from_slice(b"\r\n"); output.extend_from_slice(&raw[header_end..]); - output + + // Fail-closed scan: check for any remaining unresolved placeholders + // in both raw form and percent-decoded form of the output header block. 
+ let output_header = String::from_utf8_lossy(&output[..output.len().min(header_end + 256)]); + if output_header.contains(PLACEHOLDER_PREFIX) { + return Err(UnresolvedPlaceholderError { location: "header" }); + } + + // Also check percent-decoded form of the request line (F5 — encoded placeholder bypass) + let rewritten_rl = output_header.split("\r\n").next().unwrap_or(""); + let decoded_rl = percent_decode(rewritten_rl); + if decoded_rl.contains(PLACEHOLDER_PREFIX) { + return Err(UnresolvedPlaceholderError { location: "path" }); + } + + Ok(RewriteResult { + rewritten: output, + redacted_target: rl_result.redacted_target, + }) } pub(crate) fn rewrite_header_line(line: &str, resolver: &SecretResolver) -> String { @@ -96,10 +638,68 @@ pub(crate) fn rewrite_header_line(line: &str, resolver: &SecretResolver) -> Stri } } +/// Resolve placeholders in a request target (path + query) for OPA evaluation. +/// +/// Returns the resolved target (real secrets, for upstream) and a redacted +/// version (`[CREDENTIAL]` in place of secrets, for OPA input and logs). +pub(crate) fn rewrite_target_for_eval( + target: &str, + resolver: &SecretResolver, +) -> Result { + if !target.contains(PLACEHOLDER_PREFIX) { + // Also check percent-decoded form + let decoded = percent_decode(target); + if decoded.contains(PLACEHOLDER_PREFIX) { + return Err(UnresolvedPlaceholderError { location: "path" }); + } + return Ok(RewriteTargetResult { + resolved: target.to_string(), + redacted: target.to_string(), + }); + } + + let (path, query) = match target.split_once('?') { + Some((p, q)) => (p, Some(q)), + None => (target, None), + }; + + // Rewrite path + let (resolved_path, redacted_path) = match rewrite_uri_path(path, resolver)? { + Some((resolved, redacted)) => (resolved, redacted), + None => (path.to_string(), path.to_string()), + }; + + // Rewrite query + let (resolved_query, redacted_query) = match query { + Some(q) => match rewrite_uri_query_params(q, resolver)? 
{ + Some((resolved, redacted)) => (Some(resolved), Some(redacted)), + None => (Some(q.to_string()), Some(q.to_string())), + }, + None => (None, None), + }; + + let resolved = match &resolved_query { + Some(q) => format!("{resolved_path}?{q}"), + None => resolved_path, + }; + let redacted = match &redacted_query { + Some(q) => format!("{redacted_path}?{q}"), + None => redacted_path, + }; + + Ok(RewriteTargetResult { resolved, redacted }) +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + #[cfg(test)] mod tests { use super::*; + // === Existing tests (preserved) === + #[test] fn provider_env_is_replaced_with_placeholders() { let (child_env, resolver) = SecretResolver::from_provider_env( @@ -163,17 +763,13 @@ mod tests { ); let raw = b"POST /v1 HTTP/1.1\r\nAuthorization: Bearer openshell:resolve:env:CUSTOM_TOKEN\r\nContent-Length: 5\r\n\r\nhello"; - let rewritten = rewrite_http_header_block(raw, resolver.as_ref()); - let rewritten = String::from_utf8(rewritten).expect("utf8"); + let result = rewrite_http_header_block(raw, resolver.as_ref()).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); assert!(rewritten.contains("Authorization: Bearer secret-token\r\n")); assert!(rewritten.ends_with("\r\n\r\nhello")); } - /// Simulates the full round-trip: provider env → child placeholders → - /// HTTP headers → rewrite. This is the exact flow that occurs when a - /// sandbox child process reads placeholder env vars, constructs an HTTP - /// request, and the proxy rewrites headers before forwarding upstream. 
#[test] fn full_round_trip_child_env_to_rewritten_headers() { let provider_env: HashMap = [ @@ -191,13 +787,11 @@ mod tests { let (child_env, resolver) = SecretResolver::from_provider_env(provider_env); - // Child process reads placeholders from the environment let auth_value = child_env.get("ANTHROPIC_API_KEY").unwrap(); let token_value = child_env.get("CUSTOM_SERVICE_TOKEN").unwrap(); assert!(auth_value.starts_with(PLACEHOLDER_PREFIX)); assert!(token_value.starts_with(PLACEHOLDER_PREFIX)); - // Child constructs an HTTP request using those placeholders let raw = format!( "GET /v1/messages HTTP/1.1\r\n\ Host: api.example.com\r\n\ @@ -206,11 +800,10 @@ mod tests { Content-Length: 0\r\n\r\n" ); - // Proxy rewrites headers - let rewritten = rewrite_http_header_block(raw.as_bytes(), resolver.as_ref()); - let rewritten = String::from_utf8(rewritten).expect("utf8"); + let result = + rewrite_http_header_block(raw.as_bytes(), resolver.as_ref()).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); - // Real secrets must appear in the rewritten headers assert!( rewritten.contains("Authorization: Bearer sk-real-key-12345\r\n"), "Expected rewritten Authorization header, got: {rewritten}" @@ -219,14 +812,10 @@ mod tests { rewritten.contains("x-api-key: tok-real-svc-67890\r\n"), "Expected rewritten x-api-key header, got: {rewritten}" ); - - // Placeholders must not appear assert!( !rewritten.contains("openshell:resolve:env:"), "Placeholder leaked into rewritten request: {rewritten}" ); - - // Request line and non-secret headers must be preserved assert!(rewritten.starts_with("GET /v1/messages HTTP/1.1\r\n")); assert!(rewritten.contains("Host: api.example.com\r\n")); assert!(rewritten.contains("Content-Length: 0\r\n")); @@ -241,9 +830,8 @@ mod tests { ); let raw = b"GET / HTTP/1.1\r\nHost: example.com\r\nAccept: application/json\r\nContent-Type: text/plain\r\n\r\n"; - let rewritten = rewrite_http_header_block(raw, resolver.as_ref()); - 
// The output should be byte-identical since no placeholders are present - assert_eq!(raw.as_slice(), rewritten.as_slice()); + let result = rewrite_http_header_block(raw, resolver.as_ref()).expect("should succeed"); + assert_eq!(raw.as_slice(), result.rewritten.as_slice()); } #[test] @@ -256,7 +844,633 @@ mod tests { #[test] fn rewrite_with_no_resolver_returns_original() { let raw = b"GET / HTTP/1.1\r\nAuthorization: Bearer my-token\r\n\r\n"; - let rewritten = rewrite_http_header_block(raw, None); - assert_eq!(raw.as_slice(), rewritten.as_slice()); + let result = rewrite_http_header_block(raw, None).expect("should succeed"); + assert_eq!(raw.as_slice(), result.rewritten.as_slice()); + } + + // === Secret validation tests (F1 — CWE-113) === + + #[test] + fn resolve_placeholder_rejects_crlf() { + let (_, resolver) = SecretResolver::from_provider_env( + [("BAD_KEY".to_string(), "value\r\nEvil: header".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + assert!( + resolver + .resolve_placeholder("openshell:resolve:env:BAD_KEY") + .is_none() + ); + } + + #[test] + fn resolve_placeholder_rejects_null() { + let (_, resolver) = SecretResolver::from_provider_env( + [("BAD_KEY".to_string(), "value\0rest".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + assert!( + resolver + .resolve_placeholder("openshell:resolve:env:BAD_KEY") + .is_none() + ); + } + + #[test] + fn resolve_placeholder_accepts_normal_values() { + let (_, resolver) = SecretResolver::from_provider_env( + [("KEY".to_string(), "sk-abc123_DEF.456~xyz".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + assert_eq!( + resolver.resolve_placeholder("openshell:resolve:env:KEY"), + Some("sk-abc123_DEF.456~xyz") + ); + } + + // === Query parameter rewriting tests (absorbed from PR #631) === + + #[test] + fn rewrites_query_param_placeholder_in_request_line() { + let (child_env, resolver) = 
SecretResolver::from_provider_env( + [("YOUTUBE_API_KEY".to_string(), "AIzaSy-secret".to_string())] + .into_iter() + .collect(), + ); + let placeholder = child_env.get("YOUTUBE_API_KEY").unwrap(); + + let raw = format!( + "GET /youtube/v3/search?part=snippet&key={placeholder} HTTP/1.1\r\n\ + Host: www.googleapis.com\r\n\r\n" + ); + let result = + rewrite_http_header_block(raw.as_bytes(), resolver.as_ref()).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); + + assert!( + rewritten + .starts_with("GET /youtube/v3/search?part=snippet&key=AIzaSy-secret HTTP/1.1\r\n"), + "Expected query param rewritten, got: {rewritten}" + ); + assert!(!rewritten.contains("openshell:resolve:env:")); + } + + #[test] + fn rewrites_query_param_with_special_chars_percent_encoded() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [( + "API_KEY".to_string(), + "key with spaces&symbols=yes".to_string(), + )] + .into_iter() + .collect(), + ); + let placeholder = child_env.get("API_KEY").unwrap(); + + let raw = format!("GET /api?token={placeholder} HTTP/1.1\r\nHost: x\r\n\r\n"); + let result = + rewrite_http_header_block(raw.as_bytes(), resolver.as_ref()).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); + + assert!( + rewritten.contains("token=key%20with%20spaces%26symbols%3Dyes"), + "Expected percent-encoded secret, got: {rewritten}" + ); + } + + #[test] + fn rewrites_query_param_only_placeholder_first_param() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("KEY".to_string(), "secret123".to_string())] + .into_iter() + .collect(), + ); + let placeholder = child_env.get("KEY").unwrap(); + + let raw = format!("GET /api?key={placeholder}&format=json HTTP/1.1\r\nHost: x\r\n\r\n"); + let result = + rewrite_http_header_block(raw.as_bytes(), resolver.as_ref()).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); + + assert!( + 
rewritten.starts_with("GET /api?key=secret123&format=json HTTP/1.1"), + "Expected first param rewritten, got: {rewritten}" + ); + } + + #[test] + fn no_query_param_rewrite_without_placeholder() { + let (_, resolver) = SecretResolver::from_provider_env( + [("KEY".to_string(), "secret".to_string())] + .into_iter() + .collect(), + ); + + let raw = b"GET /api?key=normalvalue HTTP/1.1\r\nHost: x\r\n\r\n"; + let result = rewrite_http_header_block(raw, resolver.as_ref()).expect("should succeed"); + assert_eq!(raw.as_slice(), result.rewritten.as_slice()); + } + + // === Basic Authorization header encoding tests (absorbed from PR #631) === + + #[test] + fn rewrites_basic_auth_placeholder_in_decoded_token() { + let b64 = base64::engine::general_purpose::STANDARD; + + let (child_env, resolver) = SecretResolver::from_provider_env( + [("DB_PASSWORD".to_string(), "s3cret!".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let placeholder = child_env.get("DB_PASSWORD").unwrap(); + + let credentials = format!("admin:{placeholder}"); + let encoded = b64.encode(credentials.as_bytes()); + + let header_line = format!("Authorization: Basic {encoded}"); + let rewritten = rewrite_header_line(&header_line, &resolver); + + let rewritten_token = rewritten.strip_prefix("Authorization: Basic ").unwrap(); + let decoded = b64.decode(rewritten_token).unwrap(); + let decoded_str = std::str::from_utf8(&decoded).unwrap(); + + assert_eq!(decoded_str, "admin:s3cret!"); + assert!(!rewritten.contains("openshell:resolve:env:")); + } + + #[test] + fn basic_auth_without_placeholder_unchanged() { + let (_, resolver) = SecretResolver::from_provider_env( + [("KEY".to_string(), "secret".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + + let b64 = base64::engine::general_purpose::STANDARD; + let encoded = b64.encode(b"user:password"); + let header_line = format!("Authorization: Basic {encoded}"); + + let rewritten = 
rewrite_header_line(&header_line, &resolver); + assert_eq!( + rewritten, header_line, + "Should not modify non-placeholder Basic auth" + ); + } + + #[test] + fn basic_auth_full_round_trip_header_block() { + let b64 = base64::engine::general_purpose::STANDARD; + + let (child_env, resolver) = SecretResolver::from_provider_env( + [("REGISTRY_PASS".to_string(), "hunter2".to_string())] + .into_iter() + .collect(), + ); + let placeholder = child_env.get("REGISTRY_PASS").unwrap(); + let credentials = format!("deploy:{placeholder}"); + let encoded = b64.encode(credentials.as_bytes()); + + let raw = format!( + "GET /v2/_catalog HTTP/1.1\r\n\ + Host: registry.example.com\r\n\ + Authorization: Basic {encoded}\r\n\ + Accept: application/json\r\n\r\n" + ); + + let result = + rewrite_http_header_block(raw.as_bytes(), resolver.as_ref()).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); + + let auth_line = rewritten + .lines() + .find(|l| l.starts_with("Authorization:")) + .unwrap(); + let token = auth_line.strip_prefix("Authorization: Basic ").unwrap(); + let decoded = b64.decode(token).unwrap(); + assert_eq!(std::str::from_utf8(&decoded).unwrap(), "deploy:hunter2"); + + assert!(rewritten.contains("Host: registry.example.com\r\n")); + assert!(rewritten.contains("Accept: application/json\r\n")); + assert!(!rewritten.contains("openshell:resolve:env:")); + } + + // === Percent encoding tests (absorbed from PR #631) === + + #[test] + fn percent_encode_preserves_unreserved() { + assert_eq!(percent_encode_query("abc123-._~"), "abc123-._~"); + } + + #[test] + fn percent_encode_encodes_special_chars() { + assert_eq!(percent_encode_query("a b"), "a%20b"); + assert_eq!(percent_encode_query("key=val&x"), "key%3Dval%26x"); + } + + #[test] + fn percent_decode_round_trips() { + let original = "hello world & more=stuff"; + let encoded = percent_encode_query(original); + let decoded = percent_decode(&encoded); + assert_eq!(decoded, original); + } + + 
// === URL path rewriting tests === + + #[test] + fn rewrite_path_single_segment_placeholder() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("TOKEN".to_string(), "abc123".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let placeholder = child_env.get("TOKEN").unwrap(); + + let raw = format!("GET /api/{placeholder}/data HTTP/1.1\r\nHost: x\r\n\r\n"); + let result = + rewrite_http_header_block(raw.as_bytes(), Some(&resolver)).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); + + assert!( + rewritten.starts_with("GET /api/abc123/data HTTP/1.1"), + "Expected path rewritten, got: {rewritten}" + ); + assert_eq!( + result.redacted_target.as_deref(), + Some("/api/[CREDENTIAL]/data") + ); + } + + #[test] + fn rewrite_path_telegram_style_concatenated() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [( + "TELEGRAM_TOKEN".to_string(), + "123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11".to_string(), + )] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let placeholder = child_env.get("TELEGRAM_TOKEN").unwrap(); + + let raw = format!( + "POST /bot{placeholder}/sendMessage HTTP/1.1\r\nHost: api.telegram.org\r\n\r\n" + ); + let result = + rewrite_http_header_block(raw.as_bytes(), Some(&resolver)).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); + + assert!( + rewritten.starts_with( + "POST /bot123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11/sendMessage HTTP/1.1" + ), + "Expected Telegram-style path rewritten, got: {rewritten}" + ); + assert_eq!( + result.redacted_target.as_deref(), + Some("/bot[CREDENTIAL]/sendMessage") + ); + } + + #[test] + fn rewrite_path_multiple_placeholders_in_separate_segments() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [ + ("ORG_ID".to_string(), "org-123".to_string()), + ("API_KEY".to_string(), "key-456".to_string()), + ] + 
.into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let org_ph = child_env.get("ORG_ID").unwrap(); + let key_ph = child_env.get("API_KEY").unwrap(); + + let raw = format!("GET /orgs/{org_ph}/keys/{key_ph} HTTP/1.1\r\nHost: x\r\n\r\n"); + let result = + rewrite_http_header_block(raw.as_bytes(), Some(&resolver)).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); + + assert!( + rewritten.starts_with("GET /orgs/org-123/keys/key-456 HTTP/1.1"), + "Expected both path segments rewritten, got: {rewritten}" + ); + } + + #[test] + fn rewrite_path_no_placeholders_unchanged() { + let (_, resolver) = SecretResolver::from_provider_env( + [("KEY".to_string(), "secret".to_string())] + .into_iter() + .collect(), + ); + + let raw = b"GET /v1/chat/completions HTTP/1.1\r\nHost: x\r\n\r\n"; + let result = rewrite_http_header_block(raw, resolver.as_ref()).expect("should succeed"); + assert_eq!(raw.as_slice(), result.rewritten.as_slice()); + assert!(result.redacted_target.is_none()); + } + + #[test] + fn rewrite_path_preserves_query_params() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("TOKEN".to_string(), "tok123".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let placeholder = child_env.get("TOKEN").unwrap(); + + let raw = format!("GET /bot{placeholder}/method?format=json HTTP/1.1\r\nHost: x\r\n\r\n"); + let result = + rewrite_http_header_block(raw.as_bytes(), Some(&resolver)).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); + + assert!( + rewritten.starts_with("GET /bottok123/method?format=json HTTP/1.1"), + "Expected path rewritten and query preserved, got: {rewritten}" + ); + } + + #[test] + fn rewrite_path_credential_traversal_rejected() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("BAD".to_string(), "../admin".to_string())] + .into_iter() + .collect(), + ); + let 
resolver = resolver.expect("resolver"); + let placeholder = child_env.get("BAD").unwrap(); + + let raw = format!("GET /api/{placeholder}/data HTTP/1.1\r\nHost: x\r\n\r\n"); + let result = rewrite_http_header_block(raw.as_bytes(), Some(&resolver)); + assert!( + result.is_err(), + "Path traversal credential should be rejected" + ); + } + + #[test] + fn rewrite_path_credential_backslash_rejected() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("BAD".to_string(), "foo\\bar".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let placeholder = child_env.get("BAD").unwrap(); + + let raw = format!("GET /api/{placeholder} HTTP/1.1\r\nHost: x\r\n\r\n"); + let result = rewrite_http_header_block(raw.as_bytes(), Some(&resolver)); + assert!( + result.is_err(), + "Backslash in credential should be rejected" + ); + } + + #[test] + fn rewrite_path_credential_slash_rejected() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("BAD".to_string(), "foo/bar".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let placeholder = child_env.get("BAD").unwrap(); + + let raw = format!("GET /api/{placeholder} HTTP/1.1\r\nHost: x\r\n\r\n"); + let result = rewrite_http_header_block(raw.as_bytes(), Some(&resolver)); + assert!( + result.is_err(), + "Slash in path credential should be rejected" + ); + } + + #[test] + fn rewrite_path_credential_null_rejected() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("BAD".to_string(), "foo\0bar".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let placeholder = child_env.get("BAD").unwrap(); + + let raw = format!("GET /api/{placeholder} HTTP/1.1\r\nHost: x\r\n\r\n"); + // The null byte in the credential is caught by resolve_placeholder's + // validate_resolved_secret, which returns None. 
This triggers the + // unresolved placeholder path in rewrite_path_segment → fail-closed. + let result = rewrite_http_header_block(raw.as_bytes(), Some(&resolver)); + assert!( + result.is_err(), + "Null byte in credential should be rejected" + ); + } + + #[test] + fn rewrite_path_percent_encodes_special_chars() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("TOKEN".to_string(), "hello world".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let placeholder = child_env.get("TOKEN").unwrap(); + + // Space in the credential should trigger path validation rejection + // since space is safe to encode but the credential also doesn't + // contain path-unsafe chars. Actually, space IS allowed (just encoded). + // Let's test with a safe value that just needs encoding. + let raw = format!("GET /api/{placeholder}/data HTTP/1.1\r\nHost: x\r\n\r\n"); + let result = + rewrite_http_header_block(raw.as_bytes(), Some(&resolver)).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); + + assert!( + rewritten.contains("/api/hello%20world/data"), + "Expected percent-encoded path segment, got: {rewritten}" + ); + } + + // === Fail-closed tests === + + #[test] + fn unresolved_header_placeholder_returns_error() { + let (_, resolver) = SecretResolver::from_provider_env( + [("KEY".to_string(), "secret".to_string())] + .into_iter() + .collect(), + ); + + let raw = b"GET / HTTP/1.1\r\nx-api-key: openshell:resolve:env:UNKNOWN_KEY\r\n\r\n"; + let result = rewrite_http_header_block(raw, resolver.as_ref()); + assert!(result.is_err(), "Unresolved header placeholder should fail"); + } + + #[test] + fn unresolved_query_param_returns_error() { + let (_, resolver) = SecretResolver::from_provider_env( + [("KEY".to_string(), "secret".to_string())] + .into_iter() + .collect(), + ); + + let raw = b"GET /api?token=openshell:resolve:env:UNKNOWN HTTP/1.1\r\nHost: x\r\n\r\n"; + let result = 
rewrite_http_header_block(raw, resolver.as_ref()); + assert!( + result.is_err(), + "Unresolved query param placeholder should fail" + ); + } + + #[test] + fn unresolved_path_placeholder_returns_error() { + let (_, resolver) = SecretResolver::from_provider_env( + [("KEY".to_string(), "secret".to_string())] + .into_iter() + .collect(), + ); + + let raw = b"GET /api/openshell:resolve:env:UNKNOWN/data HTTP/1.1\r\nHost: x\r\n\r\n"; + let result = rewrite_http_header_block(raw, resolver.as_ref()); + assert!(result.is_err(), "Unresolved path placeholder should fail"); + } + + #[test] + fn percent_encoded_placeholder_in_path_caught() { + let (_, resolver) = SecretResolver::from_provider_env( + [("KEY".to_string(), "secret".to_string())] + .into_iter() + .collect(), + ); + + // Percent-encode "openshell:resolve:env:UNKNOWN" in the path + let encoded_placeholder = "openshell%3Aresolve%3Aenv%3AUNKNOWN"; + let raw = format!("GET /api/{encoded_placeholder}/data HTTP/1.1\r\nHost: x\r\n\r\n"); + let result = rewrite_http_header_block(raw.as_bytes(), resolver.as_ref()); + assert!( + result.is_err(), + "Percent-encoded placeholder should be caught by fail-closed scan" + ); + } + + #[test] + fn all_resolved_succeeds() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [ + ("KEY1".to_string(), "secret1".to_string()), + ("KEY2".to_string(), "secret2".to_string()), + ] + .into_iter() + .collect(), + ); + let ph1 = child_env.get("KEY1").unwrap(); + let ph2 = child_env.get("KEY2").unwrap(); + + let raw = format!( + "GET /api/{ph1}?token={ph2} HTTP/1.1\r\n\ + x-auth: {ph1}\r\n\r\n" + ); + let result = + rewrite_http_header_block(raw.as_bytes(), resolver.as_ref()).expect("should succeed"); + let rewritten = String::from_utf8(result.rewritten).expect("utf8"); + + assert!(!rewritten.contains("openshell:resolve:env:")); + assert!(rewritten.contains("secret1")); + assert!(rewritten.contains("secret2")); + } + + #[test] + fn no_resolver_passes_through_without_scanning() { + // 
Even if placeholders are present, None resolver means no scanning + let raw = b"GET /api/openshell:resolve:env:KEY HTTP/1.1\r\nHost: x\r\n\r\n"; + let result = rewrite_http_header_block(raw, None).expect("should succeed"); + assert_eq!(raw.as_slice(), result.rewritten.as_slice()); + } + + // === Redaction tests === + + #[test] + fn redacted_target_replaces_path_secrets_with_credential_marker() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("TOKEN".to_string(), "real-secret".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let placeholder = child_env.get("TOKEN").unwrap(); + + let result = rewrite_target_for_eval(&format!("/bot{placeholder}/sendMessage"), &resolver) + .expect("should succeed"); + + assert_eq!(result.redacted, "/bot[CREDENTIAL]/sendMessage"); + assert!(result.resolved.contains("real-secret")); + assert!(!result.redacted.contains("real-secret")); + } + + #[test] + fn redacted_target_replaces_query_secrets_with_credential_marker() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [("KEY".to_string(), "secret123".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let placeholder = child_env.get("KEY").unwrap(); + + let result = + rewrite_target_for_eval(&format!("/api?key={placeholder}&format=json"), &resolver) + .expect("should succeed"); + + assert_eq!(result.redacted, "/api?key=[CREDENTIAL]&format=json"); + assert!(result.resolved.contains("secret123")); + assert!(!result.redacted.contains("secret123")); + } + + #[test] + fn redacted_target_preserves_non_secret_segments() { + let (_, resolver) = SecretResolver::from_provider_env( + [("KEY".to_string(), "secret".to_string())] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + + let result = rewrite_target_for_eval("/v1/chat/completions?format=json", &resolver) + .expect("should succeed"); + + assert_eq!(result.resolved, 
"/v1/chat/completions?format=json"); + assert_eq!(result.redacted, "/v1/chat/completions?format=json"); + } + + #[test] + fn rewrite_target_for_eval_roundtrip() { + let (child_env, resolver) = SecretResolver::from_provider_env( + [ + ("TOKEN".to_string(), "tok123".to_string()), + ("KEY".to_string(), "key456".to_string()), + ] + .into_iter() + .collect(), + ); + let resolver = resolver.expect("resolver"); + let tok_ph = child_env.get("TOKEN").unwrap(); + let key_ph = child_env.get("KEY").unwrap(); + + let target = format!("/bot{tok_ph}/method?key={key_ph}"); + let result = rewrite_target_for_eval(&target, &resolver).expect("should succeed"); + + assert_eq!(result.resolved, "/bottok123/method?key=key456"); + assert_eq!(result.redacted, "/bot[CREDENTIAL]/method?key=[CREDENTIAL]"); } } diff --git a/docs/sandboxes/manage-providers.md b/docs/sandboxes/manage-providers.md index bcab48307..6d35766bf 100644 --- a/docs/sandboxes/manage-providers.md +++ b/docs/sandboxes/manage-providers.md @@ -124,6 +124,47 @@ $ openshell sandbox create -- claude This detects `claude` as a known tool, finds your `ANTHROPIC_API_KEY`, creates a provider, attaches it to the sandbox, and launches Claude Code. +## How Credential Injection Works + +The agent process inside the sandbox never sees real credential values. At startup, the proxy replaces each credential with an opaque placeholder token in the agent's environment. When the agent sends an HTTP request containing a placeholder, the proxy resolves it to the real credential before forwarding upstream. + +This resolution requires the proxy to see plaintext HTTP. Endpoints must use `protocol: rest` in the policy (which auto-terminates TLS) or explicit `tls: terminate`. Endpoints without TLS termination pass traffic through as an opaque stream, and credential placeholders are forwarded unresolved. 
+ +### Supported injection locations + +The proxy resolves credential placeholders in the following parts of an HTTP request: + +| Location | How the agent uses it | Example | +|---|---|---| +| Header value | Agent reads `$API_KEY` from env and places it in a header. | `Authorization: Bearer <placeholder>` | +| Header value (Basic auth) | Agent base64-encodes `user:<placeholder>` in an `Authorization: Basic` header. The proxy decodes, resolves, and re-encodes. | `Authorization: Basic <base64>` | +| Query parameter value | Agent places the placeholder in a URL query parameter. | `GET /api?key=<placeholder>` | +| URL path segment | Agent builds a URL with the placeholder in the path. Supports concatenated patterns. | `POST /bot<placeholder>/sendMessage` | + +The proxy does not modify request bodies, cookies, or response content. + +### Fail-closed behavior + +If the proxy detects a credential placeholder in a request but cannot resolve it, it rejects the request with HTTP 500 instead of forwarding the raw placeholder to the upstream server. This prevents accidental credential leakage in server logs or error responses. + +### Example: Telegram Bot API (path-based credential) + +Create a provider with the Telegram bot token: + +```console +$ openshell provider create --name telegram --type generic --credential TELEGRAM_BOT_TOKEN=123456:ABC-DEF +``` + +The agent reads `TELEGRAM_BOT_TOKEN` from its environment and builds a request like `POST /bot<placeholder>/sendMessage`. The proxy resolves the placeholder in the URL path and forwards `POST /bot123456:ABC-DEF/sendMessage` to the upstream. + +### Example: Google API (query parameter credential) + +```console +$ openshell provider create --name google --type generic --credential YOUTUBE_API_KEY=AIzaSy-secret +``` + +The agent sends `GET /youtube/v3/search?part=snippet&key=<placeholder>`. The proxy resolves the placeholder in the query parameter value and percent-encodes the result before forwarding. + ## Supported Provider Types The following provider types are supported.
From 9c8d6c714d44323e9568d6eba71a3e9f2d49ccb8 Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Tue, 31 Mar 2026 19:53:46 -0700 Subject: [PATCH 26/45] fix(sandbox): eliminate Box::leak memory leak in rewrite_forward_request (#715) Remove Box::leak usage that permanently leaked one String allocation per forward proxy request. Write the rewritten request line directly to the output buffer during iteration instead of mutating a Vec<&str> element. Closes #709 Co-authored-by: John Myers --- crates/openshell-sandbox/src/proxy.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index ce980a45c..a7df76e2f 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -1624,16 +1624,7 @@ fn rewrite_forward_request( .map_or(used, |p| p + 4); let header_str = String::from_utf8_lossy(&raw[..header_end]); - let mut lines = header_str.split("\r\n").collect::<Vec<&str>>(); - - // Rewrite request line: METHOD absolute-uri HTTP/1.1 → METHOD path HTTP/1.1 - if let Some(first_line) = lines.first_mut() { - let parts: Vec<&str> = first_line.splitn(3, ' ').collect(); - if parts.len() == 3 { - let new_line = format!("{} {} {}", parts[0], path, parts[2]); - *first_line = Box::leak(new_line.into_boxed_str()); // safe: short-lived - } - } + let lines = header_str.split("\r\n").collect::<Vec<&str>>(); // Rebuild headers, stripping hop-by-hop and adding proxy headers let mut output = Vec::with_capacity(header_end + 128); @@ -1642,8 +1633,17 @@ fn rewrite_forward_request( for (i, line) in lines.iter().enumerate() { if i == 0 { - // Request line — already rewritten - output.extend_from_slice(line.as_bytes()); + // Rewrite request line: METHOD absolute-uri HTTP/1.1 → METHOD path HTTP/1.1 + let parts: Vec<&str> = line.splitn(3, ' ').collect(); + if parts.len() == 3 { + 
output.extend_from_slice(parts[0].as_bytes()); + output.push(b' '); + output.extend_from_slice(path.as_bytes()); + output.push(b' '); + output.extend_from_slice(parts[2].as_bytes()); + } else { + output.extend_from_slice(line.as_bytes()); + } output.extend_from_slice(b"\r\n"); continue; } From 0ec5da8085c0cf0d2660eef291abca1589c2c30c Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 31 Mar 2026 22:07:09 -0700 Subject: [PATCH 27/45] chore(mise): use install_only_stripped precompiled Python flavor (#693) --- mise.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/mise.toml b/mise.toml index 115b13c05..d204f5316 100644 --- a/mise.toml +++ b/mise.toml @@ -10,6 +10,7 @@ redactions = ["*_TOKEN", "*_PASSWORD"] [settings] experimental = true +python.precompiled_flavor = "install_only_stripped" [tools] python = "3.13.12" From fa3f79807e5652d92e3a3440621e08d80829d2bd Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Tue, 31 Mar 2026 22:07:41 -0700 Subject: [PATCH 28/45] fix(bootstrap): use append_path_with_name for tar paths exceeding 100 bytes (#721) Replace manual Header::set_path() + append() with builder.append_path_with_name() which emits GNU LongName extensions for paths exceeding the 100-byte POSIX tar name field limit. Fixes #705 Co-authored-by: John Myers --- crates/openshell-bootstrap/src/build.rs | 66 ++++++++++++++----------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/crates/openshell-bootstrap/src/build.rs b/crates/openshell-bootstrap/src/build.rs index 9624e01b3..eaa221311 100644 --- a/crates/openshell-bootstrap/src/build.rs +++ b/crates/openshell-bootstrap/src/build.rs @@ -8,7 +8,6 @@ //! to import the image into the gateway's containerd runtime. 
use std::collections::HashMap; -use std::io::Read; use std::path::Path; use bollard::Docker; @@ -176,36 +175,10 @@ fn walk_and_add( if path.is_dir() { walk_and_add(root, &path, ignore_patterns, builder)?; } else { - let mut file = std::fs::File::open(&path) - .into_diagnostic() - .wrap_err_with(|| format!("failed to open file: {}", path.display()))?; - let metadata = file - .metadata() - .into_diagnostic() - .wrap_err_with(|| format!("failed to read metadata: {}", path.display()))?; - - let mut header = tar::Header::new_gnu(); - header.set_size(metadata.len()); - header.set_mode(0o644); - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - header.set_mode(metadata.permissions().mode()); - } - header - .set_path(&relative_normalized) - .into_diagnostic() - .wrap_err_with(|| format!("failed to set tar entry path: {relative_normalized}"))?; - header.set_cksum(); - - #[allow(clippy::cast_possible_truncation)] - let mut contents = Vec::with_capacity(metadata.len() as usize); - file.read_to_end(&mut contents) - .into_diagnostic() - .wrap_err_with(|| format!("failed to read file: {}", path.display()))?; - + // Use append_path_with_name which handles GNU LongName extensions + // for paths exceeding 100 bytes (the POSIX tar name field limit). builder - .append(&header, contents.as_slice()) + .append_path_with_name(&path, &relative_normalized) .into_diagnostic() .wrap_err_with(|| format!("failed to add file to tar: {relative_normalized}"))?; } @@ -433,6 +406,39 @@ mod tests { assert!(entries.iter().any(|e| e.contains("important.log"))); } + #[test] + fn test_long_path_exceeding_100_bytes() { + let dir = tempfile::tempdir().unwrap(); + let dir_path = dir.path(); + + // Build a nested path that exceeds 100 bytes when relative to root. 
+ let deep_dir = dir_path.join( + "a/deeply/nested/directory/path/that/exceeds/one/hundred/bytes/total/from/the/build/context/root", + ); + fs::create_dir_all(&deep_dir).unwrap(); + fs::write(deep_dir.join("file.txt"), "deep content\n").unwrap(); + fs::write(dir_path.join("Dockerfile"), "FROM ubuntu:24.04\n").unwrap(); + + let tar_bytes = create_build_context_tar(dir_path).unwrap(); + let mut archive = tar::Archive::new(tar_bytes.as_slice()); + let entries: Vec<String> = archive + .entries() + .unwrap() + .filter_map(std::result::Result::ok) + .map(|e| e.path().unwrap().to_string_lossy().to_string()) + .collect(); + + let long_entry = entries.iter().find(|e| e.contains("file.txt")); + assert!( + long_entry.is_some(), + "tar should contain deeply nested file; entries: {entries:?}" + ); + assert!( + long_entry.unwrap().len() > 100, + "path should exceed 100 bytes to exercise GNU LongName handling" + ); + } + #[test] fn test_simple_glob_match() { assert!(simple_glob_match("*.txt", "hello.txt")); From 2a4cf910054fef1e18c0f5be39764e7208aa4987 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 1 Apr 2026 13:34:12 -0700 Subject: [PATCH 29/45] fix(install): make checksum verification mandatory and validate redirect origin (#724) --- install.sh | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/install.sh b/install.sh index cf29ba746..565c48c09 100755 --- a/install.sh +++ b/install.sh @@ -92,9 +92,9 @@ download() { _output="$2" if has_cmd curl; then - curl -fLsS --retry 3 -o "$_output" "$_url" + curl -fLsS --retry 3 --max-redirs 5 -o "$_output" "$_url" elif has_cmd wget; then - wget -q --tries=3 -O "$_output" "$_url" + wget -q --tries=3 --max-redirect=5 -O "$_output" "$_url" fi } @@ -161,6 +161,18 @@ resolve_version() { _latest_url="${GITHUB_URL}/releases/latest" _resolved="$(resolve_redirect "$_latest_url")" || error "failed to resolve latest release from ${_latest_url}" + # Validate that the redirect stayed on the 
expected GitHub origin. + # A MITM or DNS hijack could redirect to an attacker-controlled domain, + # which would also serve a matching checksums file (making checksum + # verification useless). See: https://github.com/NVIDIA/OpenShell/issues/638 + case "$_resolved" in + https://github.com/${REPO}/releases/*) + ;; + *) + error "unexpected redirect target: ${_resolved} (expected https://github.com/${REPO}/releases/...)" + ;; + esac + # Extract the tag from the resolved URL: .../releases/tag/v0.0.4 -> v0.0.4 _version="${_resolved##*/}" @@ -180,20 +192,20 @@ verify_checksum() { _vc_checksums="$2" _vc_filename="$3" - _vc_expected="$(grep "$_vc_filename" "$_vc_checksums" | awk '{print $1}')" + if ! has_cmd shasum && ! has_cmd sha256sum; then + error "neither 'shasum' nor 'sha256sum' found; cannot verify download integrity" + fi + + _vc_expected="$(grep -F "$_vc_filename" "$_vc_checksums" | awk '{print $1}')" if [ -z "$_vc_expected" ]; then - warn "no checksum found for $_vc_filename, skipping verification" - return 0 + error "no checksum entry found for $_vc_filename in checksums file" fi if has_cmd shasum; then echo "$_vc_expected $_vc_archive" | shasum -a 256 -c --quiet 2>/dev/null elif has_cmd sha256sum; then echo "$_vc_expected $_vc_archive" | sha256sum -c --quiet 2>/dev/null - else - warn "sha256sum/shasum not found, skipping checksum verification" - return 0 fi } @@ -254,14 +266,13 @@ main() { error "failed to download ${_download_url}" fi - # Verify checksum + # Verify checksum (mandatory — never skip) info "verifying checksum..." - if download "$_checksums_url" "${_tmpdir}/checksums.txt"; then - if ! verify_checksum "${_tmpdir}/${_filename}" "${_tmpdir}/checksums.txt" "$_filename"; then - error "checksum verification failed for ${_filename}" - fi - else - warn "could not download checksums file, skipping verification" + if ! download "$_checksums_url" "${_tmpdir}/checksums.txt"; then + error "failed to download checksums file from ${_checksums_url}" + fi + if ! 
verify_checksum "${_tmpdir}/${_filename}" "${_tmpdir}/checksums.txt" "$_filename"; then + error "checksum verification failed for ${_filename}" fi # Extract From 219fbe756d5b722a4f6f7fffb5f1d034835cab02 Mon Sep 17 00:00:00 2001 From: Miyoung Choi Date: Wed, 1 Apr 2026 16:44:56 -0700 Subject: [PATCH 30/45] docs: add legal disclaimer and alpha banner (#726) * docs: add disclaimer and alpha banner * docs: small fix --- README.md | 6 +++++- docs/conf.py | 4 ++++ docs/index.md | 8 ++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5549fa7b4..a4bfac641 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# OpenShell +# NVIDIA OpenShell [![License](https://img.shields.io/badge/License-Apache_2.0-blue)](https://github.com/NVIDIA/OpenShell/blob/main/LICENSE) [![PyPI](https://img.shields.io/badge/PyPI-openshell-orange?logo=pypi)](https://pypi.org/project/openshell/) @@ -229,6 +229,10 @@ All implementation work is human-gated — agents propose plans, humans approve, OpenShell is built agent-first — your agent is your first collaborator. Before opening issues or submitting code, point your agent at the repo and let it use the skills in `.agents/skills/` to investigate, diagnose, and prototype. See [CONTRIBUTING.md](CONTRIBUTING.md) for the full agent skills table, contribution workflow, and development setup. +## Notice and Disclaimer + +This software automatically retrieves, accesses or interacts with external materials. Those retrieved materials are not distributed with this software and are governed solely by separate terms, conditions and licenses. You are solely responsible for finding, reviewing and complying with all applicable terms, conditions, and licenses, and for verifying the security, integrity and suitability of any retrieved materials for your specific use case. This software is provided "AS IS", without warranty of any kind. 
The author makes no representations or warranties regarding any retrieved materials, and assumes no liability for any losses, damages, liabilities or legal consequences from your use or inability to use this software or any retrieved materials. Use this software and the retrieved materials at your own risk. + ## License This project is licensed under the [Apache License 2.0](https://github.com/NVIDIA/OpenShell/blob/main/LICENSE). diff --git a/docs/conf.py b/docs/conf.py index a54d72435..9afa14409 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -93,6 +93,10 @@ highlight_language = "console" html_theme_options = { + "announcement": ( + "🔔 NVIDIA OpenShell is alpha software. APIs and behavior" + " may change without notice. Do not use in production." + ), "icon_links": [ { "name": "GitHub", diff --git a/docs/index.md b/docs/index.md index 37d49047b..1a825beeb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -176,6 +176,14 @@ Policy schema, environment variables, and system architecture. :::: +--- + +```{admonition} Notice and Disclaimer +:class: warning + +This software automatically retrieves, accesses or interacts with external materials. Those retrieved materials are not distributed with this software and are governed solely by separate terms, conditions and licenses. You are solely responsible for finding, reviewing and complying with all applicable terms, conditions, and licenses, and for verifying the security, integrity and suitability of any retrieved materials for your specific use case. This software is provided "AS IS", without warranty of any kind. The author makes no representations or warranties regarding any retrieved materials, and assumes no liability for any losses, damages, liabilities or legal consequences from your use or inability to use this software or any retrieved materials. Use this software and the retrieved materials at your own risk. 
+``` + ```{toctree} :hidden: From e271180f0b469b48dfb8f9b7af9874f09e424ca1 Mon Sep 17 00:00:00 2001 From: Miyoung Choi Date: Wed, 1 Apr 2026 16:45:42 -0700 Subject: [PATCH 31/45] docs: add security best practices (#714) * docs: add security best practices * docs: improve per style guide * docs: add xref to nemoclaw --- docs/index.md | 17 ++ docs/security/best-practices.md | 319 ++++++++++++++++++++++++++++++++ 2 files changed, 336 insertions(+) create mode 100644 docs/security/best-practices.md diff --git a/docs/index.md b/docs/index.md index 1a825beeb..89979a76e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -174,6 +174,16 @@ Policy schema, environment variables, and system architecture. {bdg-secondary}`Reference` ::: +:::{grid-item-card} Security Best Practices +:link: security/best-practices +:link-type: doc + +Every configurable security control, its default, and the risk of changing it. + ++++ +{bdg-secondary}`Concept` +::: + :::: --- @@ -238,6 +248,13 @@ reference/policy-schema reference/support-matrix ``` +```{toctree} +:caption: Security +:hidden: + +security/best-practices +``` + ```{toctree} :caption: Resources :hidden: diff --git a/docs/security/best-practices.md b/docs/security/best-practices.md new file mode 100644 index 000000000..a54896f5a --- /dev/null +++ b/docs/security/best-practices.md @@ -0,0 +1,319 @@ +--- +title: + page: "OpenShell Security Best Practices — Controls, Risks, and Configuration Guidance" + nav: "Security Best Practices" +description: "A guide to every configurable security control in OpenShell: defaults, what you can change, and the risks of each choice." +topics: +- Generative AI +- Cybersecurity +tags: +- Security +- Policy +- Sandbox +- Landlock +- Seccomp +content: + type: concept + difficulty: intermediate + audience: + - engineer + - security_engineer +status: published +--- + + + +# Security Best Practices + +OpenShell enforces sandbox security across four layers: network, filesystem, process, and inference. 
+This page documents every configurable control, its default, what it protects, and the risk of relaxing it. + +For the full policy YAML schema, refer to the {doc}`../reference/policy-schema`. +For the architecture of each enforcement layer, refer to {doc}`../about/architecture`. + +:::{seealso} +If you use [NemoClaw](https://github.com/NVIDIA/NemoClaw) to run OpenClaw assistants, its [Security Best Practices](https://docs.nvidia.com/nemoclaw/latest/security/best-practices.html) guide covers additional entrypoint-level controls, policy presets, provider trust tiers, and posture profiles specific to the NemoClaw blueprint. +::: + +## Enforcement Layers + +OpenShell applies security controls at two enforcement points. +OpenShell locks static controls at sandbox creation and requires destroying and recreating the sandbox to change them. +You can update dynamic controls on a running sandbox with `openshell policy set`. + +:::{list-table} +:header-rows: 1 +:widths: 20 30 20 30 + +* - Layer + - What it protects + - Enforcement point + - Changeable at runtime + +* - Network + - Unauthorized outbound connections and data exfiltration. + - CONNECT proxy + OPA policy engine + - Yes. Use `openshell policy set` or operator approval in the TUI. + +* - Filesystem + - System binary tampering, credential theft, config manipulation. + - Landlock LSM (kernel level) + - No. Requires sandbox re-creation. + +* - Process + - Privilege escalation, fork bombs, dangerous syscalls. + - Seccomp BPF + privilege drop (`setuid`/`setgid`) + - No. Requires sandbox re-creation. + +* - Inference + - Credential exposure, unauthorized model access. + - Proxy intercept of `inference.local` + - Yes. Use `openshell inference set`. + +::: + +## Network Controls + +The CONNECT proxy and OPA policy engine enforce all network controls at the gateway level. + +### Deny-by-Default Egress + +Every outbound connection from the sandbox goes through the CONNECT proxy. 
+The proxy evaluates each connection against the OPA policy engine. +If no `network_policies` entry matches the destination host, port, and calling binary, the proxy denies the connection. + +| Aspect | Detail | +|---|---| +| Default | All egress denied. Only endpoints listed in `network_policies` can receive traffic. | +| What you can change | Add entries to `network_policies` in the policy YAML. Apply statically at creation (`--policy`) or dynamically (`openshell policy set`). | +| Risk if relaxed | Each allowed endpoint is a potential data exfiltration path. The agent can send workspace content, credentials, or conversation history to any reachable host. | +| Recommendation | Add only endpoints the agent needs for its task. Start with a minimal policy and use denied-request logs (`openshell logs --source sandbox`) to identify missing endpoints. | + +### Network Namespace Isolation + +The sandbox runs in a dedicated Linux network namespace with a veth pair. +All traffic routes through the host-side veth IP (`10.200.0.1`) where the proxy listens. +Even if a process ignores proxy environment variables, it can only reach the proxy. + +| Aspect | Detail | +|---|---| +| Default | Always active. The sandbox cannot bypass the proxy at the network level. | +| What you can change | This is not a user-facing knob. OpenShell always enforces it in proxy mode. | +| Risk if bypassed | Without network namespace isolation, a process could connect directly to the internet, bypassing all policy enforcement. | +| Recommendation | No action needed. OpenShell enforces this automatically. | + +### Binary Identity Binding + +The proxy identifies which binary initiated each connection by reading `/proc/<pid>/exe` (the kernel-trusted executable path). +It walks the process tree for ancestor binaries and parses `/proc/<pid>/cmdline` for script interpreters. +The proxy SHA256-hashes each binary on first use (trust-on-first-use). 
If someone replaces a binary mid-session, the hash mismatch triggers an immediate deny. + +| Aspect | Detail | +|---|---| +| Default | Every `network_policies` entry requires a `binaries` list. Only listed binaries can reach the associated endpoints. Binary paths support glob patterns (`*` for one path component, `**` for recursive). | +| What you can change | Add binaries to an endpoint entry. Use glob patterns for directory-scoped access (for example, `/sandbox/.vscode-server/**`). | +| Risk if relaxed | Broad glob patterns (like `/**`) allow any binary to reach the endpoint, defeating the purpose of binary-scoped enforcement. | +| Recommendation | Scope binaries to the specific executables that need each endpoint. Use narrow globs when the exact path varies (for example, across Python virtual environments). | + +### L4-Only vs L7 Inspection + +The `protocol` field on an endpoint controls whether the proxy inspects individual HTTP requests inside the tunnel. + +| Aspect | Detail | +|---|---| +| Default | Endpoints without a `protocol` field use L4-only enforcement: the proxy checks host, port, and binary, then relays the TCP stream without inspecting payloads. | +| What you can change | Add `protocol: rest` to enable per-request HTTP inspection. Pair it with `rules` (fine-grained method and path control) or `access` presets (`full`, `read-only`, `read-write`). | +| Risk if relaxed | L4-only endpoints allow the agent to send any data through the tunnel after the initial connection is permitted. The proxy cannot see HTTP methods, paths, or bodies. Adding `access: full` with `protocol: rest` enables inspection but permits all methods and paths, providing observability without restriction. | +| Recommendation | Use `protocol: rest` with specific `rules` for APIs where you want method and path control. Use `access: read-only` for read-only endpoints. Omit `protocol` for non-HTTP protocols (WebSocket, gRPC streaming). 
| + +### Enforcement Mode (`audit` vs `enforce`) + +When `protocol: rest` is active, the `enforcement` field controls whether the proxy blocks or logs rule violations. + +| Aspect | Detail | +|---|---| +| Default | `audit`. The proxy logs violations but forwards traffic. | +| What you can change | Set `enforcement: enforce` to block requests that do not match any `rules` entry. Denied requests receive a `403 Forbidden` response with a JSON body describing the violation. | +| Risk if relaxed | `audit` mode provides visibility but does not prevent unauthorized actions. An agent can still perform write or delete operations on an API even if the rules would deny them. | +| Recommendation | Start with `audit` to understand traffic patterns and verify that rules are correct. Switch to `enforce` once you have validated that the rules match the intended access pattern. | + +### TLS Handling + +The proxy auto-detects TLS on every tunnel by peeking the first bytes. +When a TLS ClientHello is detected, the proxy terminates TLS transparently using a per-sandbox ephemeral CA. +This enables credential injection and L7 inspection without explicit configuration. + +| Aspect | Detail | +|---|---| +| Default | Auto-detect and terminate. OpenShell generates the sandbox CA at startup and injects it into the process trust stores (`NODE_EXTRA_CA_CERTS`, `SSL_CERT_FILE`, `REQUESTS_CA_BUNDLE`, `CURL_CA_BUNDLE`). | +| What you can change | Set `tls: skip` on an endpoint to disable TLS detection and termination for that endpoint. Use this for client-certificate mTLS to upstream or non-standard binary protocols. | +| Risk if relaxed | `tls: skip` disables credential injection and L7 inspection for that endpoint. The proxy relays encrypted traffic without seeing the contents. | +| Recommendation | Use auto-detect (the default) for most endpoints. Use `tls: skip` only when the upstream requires the client's own TLS certificate (mTLS) or uses a non-HTTP protocol. 
| + +### SSRF Protection + +After OPA policy allows a connection, the proxy resolves DNS and rejects connections where the resolved IP is internal (loopback, link-local, or RFC 1918 private). + +| Aspect | Detail | +|---|---| +| Default | The proxy blocks all private IPs. Loopback (`127.0.0.0/8`) and link-local (`169.254.0.0/16`) remain blocked even with `allowed_ips`. | +| What you can change | Add `allowed_ips` (CIDR notation) to an endpoint to permit connections to specific private IP ranges. | +| Risk if relaxed | Without SSRF protection, a misconfigured policy could allow the agent to reach cloud metadata services (`169.254.169.254`), internal databases, or other infrastructure endpoints through DNS rebinding. | +| Recommendation | Use `allowed_ips` only for known internal services. Scope the CIDR as narrowly as possible (for example, `10.0.5.20/32` for a single host). Loopback and link-local are always blocked regardless of `allowed_ips`. | + +### Operator Approval + +When the agent requests an endpoint not in the policy, OpenShell blocks it and surfaces the request in the TUI for operator review. +The system merges approved endpoints into the sandbox's policy as a new durable revision. + +| Aspect | Detail | +|---|---| +| Default | Enabled. The proxy blocks unlisted endpoints and requires approval. | +| What you can change | Approved endpoints persist across sandbox restarts within the same sandbox instance. They reset when the sandbox is destroyed and recreated. | +| Risk if relaxed | Approving an endpoint permanently widens the running sandbox's policy. Review each request before approving. | +| Recommendation | Use operator approval for exploratory work. For recurring endpoints, add them to the policy YAML with appropriate binary and path restrictions. To reset all approved endpoints, destroy and recreate the sandbox. | + +## Filesystem Controls + +Landlock LSM restricts which paths the sandbox process can read or write at the kernel level. 
+ +### Landlock LSM + +Landlock enforces filesystem access at the kernel level. +Paths listed in `read_only` receive read-only access. +Paths listed in `read_write` receive full access. +All other paths are inaccessible. + +| Aspect | Detail | +|---|---| +| Default | `compatibility: best_effort`. Uses the highest kernel ABI available. The system skips missing paths with a warning. If the kernel does not support Landlock, the sandbox continues without filesystem restrictions. | +| What you can change | Set `compatibility: hard_requirement` to abort sandbox startup if Landlock is unavailable or any configured path cannot be opened. | +| Risk if relaxed | On kernels without Landlock (pre-5.13), or when all paths fail to open, the sandbox runs without kernel-level filesystem restrictions. The agent can access any file the process user can access. | +| Recommendation | Use `best_effort` for development. Use `hard_requirement` in environments where any gap in filesystem isolation is unacceptable. Run on Ubuntu 22.04+ or any kernel 5.13+ for Landlock support. | + +### Read-Only vs Read-Write Paths + +The policy separates filesystem paths into read-only and read-write groups. + +| Aspect | Detail | +|---|---| +| Default | System paths (`/usr`, `/lib`, `/etc`, `/var/log`) are read-only. Working paths (`/sandbox`, `/tmp`) are read-write. `/app` is conditionally included if it exists. | +| What you can change | Add or remove paths in `filesystem_policy.read_only` and `filesystem_policy.read_write`. | +| Risk if relaxed | Making system paths writable lets the agent replace binaries, modify TLS trust stores, or change DNS resolution. Validation rejects broad read-write paths (like `/`). | +| Recommendation | Keep system paths read-only. If the agent needs additional writable space, add a specific subdirectory. | + +### Path Validation + +OpenShell validates policies before they take effect. + +| Constraint | Behavior | +|---|---| +| Paths must be absolute (start with `/`). 
| Rejected with `INVALID_ARGUMENT`. | +| Paths must not contain `..` traversal. | Rejected with `INVALID_ARGUMENT`. | +| Read-write paths must not be overly broad (for example, `/` alone). | Rejected with `INVALID_ARGUMENT`. | +| Each path must not exceed 4096 characters. | Rejected with `INVALID_ARGUMENT`. | +| Combined `read_only` + `read_write` paths must not exceed 256. | Rejected with `INVALID_ARGUMENT`. | + +## Process Controls + +The sandbox supervisor drops privileges, applies seccomp filters, and enforces process-level restrictions during startup. + +### Privilege Drop + +The sandbox process runs as a non-root user after explicit privilege dropping. + +| Aspect | Detail | +|---|---| +| Default | `run_as_user: sandbox`, `run_as_group: sandbox`. The supervisor calls `setuid()`/`setgid()` with post-condition verification: confirms the effective UID/GID match the target and that `setuid(0)` fails (root cannot be re-acquired). | +| What you can change | Set `run_as_user` and `run_as_group` in the `process` section. Validation rejects root (`root` or `0`). | +| Risk if relaxed | Running as a higher-privilege user increases the impact of container escape vulnerabilities. | +| Recommendation | Keep the `sandbox` user. Do not attempt to set root. | + +### Seccomp Filters + +A BPF seccomp filter restricts which socket domains the sandbox process can use. + +| Aspect | Detail | +|---|---| +| Default | The filter allows `AF_INET` and `AF_INET6` (for proxy communication) and blocks `AF_NETLINK`, `AF_PACKET`, `AF_BLUETOOTH`, and `AF_VSOCK` with `EPERM`. The sandbox sets `PR_SET_NO_NEW_PRIVS` before applying the filter. | +| What you can change | This is not a user-facing knob. OpenShell enforces it automatically. | +| Risk if relaxed | `AF_NETLINK` allows manipulation of routing tables and firewall rules. `AF_PACKET` enables raw packet capture. `AF_VSOCK` enables VM socket communication. | +| Recommendation | No action needed. OpenShell enforces this automatically. 
| + +### Enforcement Application Order + +The sandbox supervisor applies enforcement in a specific order during process startup. +This ordering is intentional: privilege dropping needs `/etc/group` and `/etc/passwd`, which Landlock subsequently restricts. + +1. Network namespace entry (`setns`). +2. Privilege drop (`initgroups` + `setgid` + `setuid`). +3. Landlock filesystem restrictions. +4. Seccomp socket domain filters. + +## Inference Controls + +OpenShell routes all inference traffic through the gateway to isolate provider credentials from the sandbox. + +### Routed Inference through `inference.local` + +The proxy intercepts HTTPS CONNECT requests to `inference.local` and routes matching inference API requests through the sandbox-local router. +The agent never receives the provider API key. + +| Aspect | Detail | +|---|---| +| Default | Always active. The proxy handles `inference.local` before OPA policy evaluation. The gateway injects credentials on the host side. | +| What you can change | Configure inference routes with `openshell inference set`. | +| Risk if bypassed | If an inference provider's host is added directly to `network_policies`, the agent could reach it with a stolen or hardcoded key, bypassing credential isolation. | +| Recommendation | Do not add inference provider hosts to `network_policies`. Use OpenShell inference routing instead. | + +## Gateway Security + +The gateway secures communication between the CLI, sandbox pods, and external clients with mutual TLS and token-based authentication. + +### mTLS + +Communication between the CLI, sandbox pods, and the gateway is secured by mutual TLS. +OpenShell generates a cluster CA at bootstrap and distributes it through Kubernetes secrets. + +| Aspect | Detail | +|---|---| +| Default | mTLS required. Both client and server present certificates that the cluster CA signed. 
| +| What you can change | Enable dual-auth mode (`allow_unauthenticated=true`) for Cloudflare Tunnel deployments, or disable TLS entirely for trusted reverse-proxy setups. | +| Risk if relaxed | Dual-auth mode accepts clients without certificates and defers authentication to the HTTP layer (Cloudflare JWT). Disabling TLS removes transport-level authentication entirely. | +| Recommendation | Use mTLS (the default) unless deploying behind Cloudflare or a trusted reverse proxy. | + +### SSH Tunnel Authentication + +SSH connections to sandboxes pass through the gateway's HTTP CONNECT tunnel with token-based authentication and HMAC-SHA256 handshake verification (NSSH1 protocol). + +| Aspect | Detail | +|---|---| +| Default | Session tokens expire after 24 hours. Concurrent connections are limited to 10 per token and 20 per sandbox. | +| What you can change | Configure `ssh_session_ttl_secs`. Set to 0 for no expiry. | +| Risk if relaxed | Longer TTLs or no expiry increase the window for stolen token reuse. Higher connection limits increase the blast radius of a compromised token. | +| Recommendation | Keep the 24-hour default. Monitor connection counts through the TUI. | + +## Common Mistakes + +The following patterns weaken security without providing meaningful benefit. + +| Mistake | Why it matters | What to do instead | +|---------|---------------|-------------------| +| Omitting `protocol: rest` on REST API endpoints | Without `protocol: rest`, the proxy uses L4-only enforcement. It allows the TCP stream through after checking host, port, and binary, but cannot inspect individual HTTP requests. | Add `protocol: rest` with specific `rules` to enable per-request method and path control. | +| Using `access: full` when finer rules would suffice | `access: full` with `protocol: rest` enables inspection but allows all HTTP methods and paths. | Use `access: read-only` or explicit `rules` to restrict what the agent can do at the HTTP level. 
| +| Adding endpoints permanently when operator approval would suffice | Adding endpoints to the policy YAML makes them permanently reachable across all instances. | Use operator approval. Approved endpoints persist within the sandbox instance but reset on re-creation. | +| Using broad binary globs | A glob like `/**` allows any binary to reach the endpoint, defeating binary-scoped enforcement. | Scope globs to specific directories (for example, `/sandbox/.vscode-server/**`). | +| Skipping TLS termination on HTTPS APIs | Setting `tls: skip` disables credential injection and L7 inspection. | Use the default auto-detect behavior unless the upstream requires client-certificate mTLS. | +| Setting `enforcement: enforce` before auditing | Jumping to `enforce` without first running in `audit` mode risks breaking the agent's workflow. | Start with `audit`, review the logs, and switch to `enforce` once you have validated the rules. | + +## Related Topics + +- {doc}`../sandboxes/policies` for applying and iterating on sandbox policies. +- {doc}`../reference/policy-schema` for the full field-by-field YAML reference. +- {doc}`../reference/default-policy` for the built-in default policy breakdown. +- {doc}`../reference/gateway-auth` for gateway authentication details. +- {doc}`../about/architecture` for the system architecture. +- NemoClaw [Security Best Practices](https://docs.nvidia.com/nemoclaw/latest/security/best-practices.html) for entrypoint-level controls (capability drops, PATH hardening, build toolchain removal), policy presets, provider trust tiers, and posture profiles. From d9e8fe55a3823983d9ccdfbc11237d711c4a3087 Mon Sep 17 00:00:00 2001 From: "John T. 
Myers" <9696606+johntmyers@users.noreply.github.com> Date: Wed, 1 Apr 2026 17:11:47 -0700 Subject: [PATCH 32/45] fix(cli): add missing Copilot variant to CliProviderType enum (#713) PR #476 added Copilot as a supported provider but missed adding the Copilot variant to the CliProviderType enum, causing --provider copilot to fail at CLI argument parsing. Also adds a parity test to prevent future drift between CliProviderType and ProviderRegistry. Closes #707 Co-authored-by: John Myers --- crates/openshell-cli/src/main.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 4f4d49695..5f10ef903 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -597,6 +597,7 @@ enum CliProviderType { Claude, Opencode, Codex, + Copilot, Generic, Openai, Anthropic, @@ -627,6 +628,7 @@ impl CliProviderType { Self::Claude => "claude", Self::Opencode => "opencode", Self::Codex => "codex", + Self::Copilot => "copilot", Self::Generic => "generic", Self::Openai => "openai", Self::Anthropic => "anthropic", @@ -3137,4 +3139,29 @@ mod tests { other => panic!("expected settings delete command, got: {other:?}"), } } + + /// Ensure every provider registered in `ProviderRegistry` has a + /// corresponding `CliProviderType` variant (and vice-versa). + /// This test would have caught the missing `Copilot` variant from #707. + #[test] + fn cli_provider_types_match_registry() { + let registry = openshell_providers::ProviderRegistry::new(); + let registry_types: std::collections::BTreeSet<&str> = + registry.known_types().into_iter().collect(); + + let cli_types: std::collections::BTreeSet<&str> = + ::value_variants() + .iter() + .map(CliProviderType::as_str) + .collect(); + + assert_eq!( + cli_types, + registry_types, + "CliProviderType variants must match ProviderRegistry.known_types(). 
\ + CLI-only: {:?}, Registry-only: {:?}", + cli_types.difference(&registry_types).collect::<Vec<_>>(), + registry_types.difference(&cli_types).collect::<Vec<_>>(), + ); + } } From 1c659c1c12602e5d492e3ed85ba169246425017b Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Wed, 1 Apr 2026 18:04:25 -0700 Subject: [PATCH 33/45] fix(sandbox/bootstrap): GPU Landlock baseline paths and CDI spec missing diagnosis (#710) * fix(sandbox): add GPU device nodes and nvidia-persistenced to landlock baseline Landlock READ_FILE/WRITE_FILE restricts open(2) on character device files even when DAC permissions would otherwise allow it. GPU sandboxes need /dev/nvidiactl, /dev/nvidia-uvm, /dev/nvidia-uvm-tools, /dev/nvidia-modeset, and per-GPU /dev/nvidiaX nodes in the policy to allow NVML initialization. Additionally, CDI bind-mounts /run/nvidia-persistenced/socket into the container. NVML tries to connect to this socket at init time; if the directory is not in the landlock policy, it receives EACCES (not ECONNREFUSED), which causes NVML to abort with NVML_ERROR_INSUFFICIENT_PERMISSIONS even though nvidia-persistenced is optional. Both classes of paths are auto-added to the baseline when /dev/nvidiactl is present. Per-GPU device nodes are enumerated at runtime to handle multi-GPU configurations. 
--- crates/openshell-bootstrap/src/errors.rs | 110 +++++++++++++ crates/openshell-sandbox/src/lib.rs | 200 +++++++++++++++++++++-- e2e/python/test_inference_routing.py | 2 +- e2e/python/test_sandbox_policy.py | 2 +- 4 files changed, 294 insertions(+), 20 deletions(-) diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index b487c94a6..9e385c680 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -169,6 +169,21 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[ match_mode: MatchMode::Any, diagnose: diagnose_docker_not_running, }, + // CDI specs missing — Docker daemon has CDI configured but no spec files exist + // or the requested device ID (nvidia.com/gpu=all) is not in any spec. + // Matches errors from Docker 25+ and containerd CDI injection paths. + FailurePattern { + matchers: &[ + "CDI device not found", + "unknown CDI device", + "failed to inject CDI devices", + "no CDI devices found", + "CDI device injection failed", + "unresolvable CDI devices", + ], + match_mode: MatchMode::Any, + diagnose: diagnose_cdi_specs_missing, + }, ]; fn diagnose_corrupted_state(gateway_name: &str) -> GatewayFailureDiagnosis { @@ -396,6 +411,29 @@ fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis { } } +fn diagnose_cdi_specs_missing(_gateway_name: &str) -> GatewayFailureDiagnosis { + GatewayFailureDiagnosis { + summary: "CDI specs not found on host".to_string(), + explanation: "GPU passthrough via CDI was selected (your Docker daemon has CDI spec \ + directories configured) but no CDI device specs were found on the host. \ + Specs must be pre-generated before OpenShell can inject the GPU into the \ + cluster container." 
+ .to_string(), + recovery_steps: vec![ + RecoveryStep::with_command( + "Generate CDI specs on the host (nvidia-ctk creates /etc/cdi/ if it does not exist)", + "sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml", + ), + RecoveryStep::with_command( + "Verify the specs were generated and include nvidia.com/gpu entries", + "nvidia-ctk cdi list", + ), + RecoveryStep::new("Then retry: openshell gateway start --gpu"), + ], + retryable: false, + } +} + fn diagnose_docker_not_running(_gateway_name: &str) -> GatewayFailureDiagnosis { GatewayFailureDiagnosis { summary: "Docker is not running".to_string(), @@ -925,4 +963,76 @@ mod tests { "commands should include gateway name, got: {all_commands}" ); } + + #[test] + fn test_diagnose_cdi_device_not_found() { + let diagnosis = diagnose_failure( + "test", + "could not run container: CDI device not found: nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + assert!(!d.retryable); + } + + #[test] + fn test_diagnose_cdi_injection_failed_unresolvable() { + // Exact error observed from Docker 500 response + let diagnosis = diagnose_failure( + "test", + "Docker responded with status code 500: CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + assert!(!d.retryable); + } + + #[test] + fn test_diagnose_unknown_cdi_device() { + // containerd error path + let diagnosis = diagnose_failure( + "test", + "unknown CDI device requested: nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + } + + #[test] + fn test_diagnose_cdi_recovery_mentions_nvidia_ctk() { + let d = 
diagnose_failure("test", "CDI device not found", None) + .expect("should match CDI pattern"); + let all_steps: String = d + .recovery_steps + .iter() + .map(|s| format!("{} {}", s.description, s.command.as_deref().unwrap_or(""))) + .collect::<Vec<_>>() + .join("\n"); + assert!( + all_steps.contains("nvidia-ctk cdi generate"), + "recovery steps should mention nvidia-ctk cdi generate, got: {all_steps}" + ); + assert!( + all_steps.contains("/etc/cdi/"), + "recovery steps should mention /etc/cdi/, got: {all_steps}" + ); + } } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 149632446..2384a2170 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -877,13 +877,106 @@ pub(crate) fn spawn_route_refresh( /// Minimum read-only paths required for a proxy-mode sandbox child process to /// function: dynamic linker, shared libraries, DNS resolution, CA certs, -/// Python venv, and openshell logs. -const PROXY_BASELINE_READ_ONLY: &[&str] = &["/usr", "/lib", "/etc", "/app", "/var/log"]; +/// Python venv, openshell logs, process info, and random bytes. +/// +/// `/proc` and `/dev/urandom` are included here for the same reasons they +/// appear in `restrictive_default_policy()`: virtually every process needs +/// them. Before the Landlock per-path fix (#677) these were effectively free +/// because a missing path silently disabled the entire ruleset; now they must +/// be explicit. +const PROXY_BASELINE_READ_ONLY: &[&str] = &[ + "/usr", + "/lib", + "/etc", + "/app", + "/var/log", + "/proc", + "/dev/urandom", +]; /// Minimum read-write paths required for a proxy-mode sandbox child process: /// user working directory and temporary files. const PROXY_BASELINE_READ_WRITE: &[&str] = &["/sandbox", "/tmp"]; +/// GPU read-only paths. +/// +/// `/run/nvidia-persistenced`: NVML tries to connect to the persistenced +/// socket at init time. 
If the directory exists but Landlock denies traversal +/// (EACCES vs ECONNREFUSED), NVML returns `NVML_ERROR_INSUFFICIENT_PERMISSIONS` +/// even though the daemon is optional. Only read/traversal access is needed. +const GPU_BASELINE_READ_ONLY: &[&str] = &["/run/nvidia-persistenced"]; + +/// GPU read-write paths (static). +/// +/// `/dev/nvidiactl`, `/dev/nvidia-uvm`, `/dev/nvidia-uvm-tools`, +/// `/dev/nvidia-modeset`: control and UVM devices injected by CDI. +/// Landlock restricts `open(2)` on device files even when DAC allows it; +/// these need read-write because NVML/CUDA opens them with `O_RDWR`. +/// +/// `/proc`: CUDA writes to `/proc/<pid>/task/<tid>/comm` during `cuInit()` +/// to set thread names. Without write access, `cuInit()` returns error 304. +/// Must use `/proc` (not `/proc/self/task`) because Landlock rules bind to +/// inodes and child processes have different procfs inodes than the parent. +/// +/// Per-GPU device files (`/dev/nvidia0`, …) are enumerated at runtime by +/// `enumerate_gpu_device_nodes()` since the count varies. +const GPU_BASELINE_READ_WRITE: &[&str] = &[ + "/dev/nvidiactl", + "/dev/nvidia-uvm", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-modeset", + "/proc", +]; + +/// Returns true if GPU devices are present in the container. +fn has_gpu_devices() -> bool { + std::path::Path::new("/dev/nvidiactl").exists() +} + +/// Enumerate per-GPU device nodes (`/dev/nvidia0`, `/dev/nvidia1`, …). +fn enumerate_gpu_device_nodes() -> Vec<String> { + let mut paths = Vec::new(); + if let Ok(entries) = std::fs::read_dir("/dev") { + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + if let Some(suffix) = name.strip_prefix("nvidia") { + if suffix.is_empty() || !suffix.chars().all(|c| c.is_ascii_digit()) { + continue; + } + paths.push(entry.path().to_string_lossy().into_owned()); + } + } + } + paths } + +/// Collect all baseline paths for enrichment: proxy defaults + GPU (if present). 
+/// Returns `(read_only, read_write)` as owned `String` vecs. +fn baseline_enrichment_paths() -> (Vec<String>, Vec<String>) { + let mut ro: Vec<String> = PROXY_BASELINE_READ_ONLY + .iter() + .map(|&s| s.to_string()) + .collect(); + let mut rw: Vec<String> = PROXY_BASELINE_READ_WRITE + .iter() + .map(|&s| s.to_string()) + .collect(); + + if has_gpu_devices() { + ro.extend(GPU_BASELINE_READ_ONLY.iter().map(|&s| s.to_string())); + rw.extend(GPU_BASELINE_READ_WRITE.iter().map(|&s| s.to_string())); + rw.extend(enumerate_gpu_device_nodes()); + } + + // A path promoted to read_write (e.g. /proc for GPU) should not also + // appear in read_only — Landlock handles the overlap correctly but the + // duplicate is confusing when inspecting the effective policy. + ro.retain(|p| !rw.contains(p)); + + (ro, rw) +} + /// Ensure a proto `SandboxPolicy` includes the baseline filesystem paths /// required for proxy-mode sandboxes. Paths are only added if missing; /// user-specified paths are never removed. @@ -902,14 +995,16 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) ..Default::default() }); + let (ro, rw) = baseline_enrichment_paths(); + + // Baseline paths are system-injected, not user-specified. Skip paths + // that do not exist in this container image to avoid noisy warnings from + // Landlock and, more critically, to prevent a single missing baseline + // path from abandoning the entire Landlock ruleset under best-effort + // mode (see issue #664). let mut modified = false; - for &path in PROXY_BASELINE_READ_ONLY { - if !fs.read_only.iter().any(|p| p.as_str() == path) { - // Baseline paths are system-injected, not user-specified. Skip - // paths that do not exist in this container image to avoid noisy - // warnings from Landlock and, more critically, to prevent a single - // missing baseline path from abandoning the entire Landlock - // ruleset under best-effort mode (see issue #664). 
+ for path in &ro { + if !fs.read_only.iter().any(|p| p == path) && !fs.read_write.iter().any(|p| p == path) { if !std::path::Path::new(path).exists() { debug!( path, @@ -917,12 +1012,12 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) ); continue; } - fs.read_only.push(path.to_string()); + fs.read_only.push(path.clone()); modified = true; } } - for &path in PROXY_BASELINE_READ_WRITE { - if !fs.read_write.iter().any(|p| p.as_str() == path) { + for path in &rw { + if !fs.read_write.iter().any(|p| p == path) { if !std::path::Path::new(path).exists() { debug!( path, @@ -930,7 +1025,7 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) ); continue; } - fs.read_write.push(path.to_string()); + fs.read_write.push(path.clone()); modified = true; } } @@ -950,12 +1045,12 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { return; } + let (ro, rw) = baseline_enrichment_paths(); + let mut modified = false; - for &path in PROXY_BASELINE_READ_ONLY { + for path in &ro { let p = std::path::PathBuf::from(path); - if !policy.filesystem.read_only.contains(&p) { - // Baseline paths are system-injected — skip non-existent paths to - // avoid Landlock ruleset abandonment (issue #664). + if !policy.filesystem.read_only.contains(&p) && !policy.filesystem.read_write.contains(&p) { if !p.exists() { debug!( path, @@ -967,7 +1062,7 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { modified = true; } } - for &path in PROXY_BASELINE_READ_WRITE { + for path in &rw { let p = std::path::PathBuf::from(path); if !policy.filesystem.read_write.contains(&p) { if !p.exists() { @@ -987,6 +1082,75 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { } } +#[cfg(test)] +mod baseline_tests { + use super::*; + + #[test] + fn proc_not_in_both_read_only_and_read_write_when_gpu_present() { + // When GPU devices are present, /proc is promoted to read_write + // (CUDA needs to write /proc//task//comm). 
It should + // NOT also appear in read_only. + if !has_gpu_devices() { + // Can't test GPU dedup without GPU devices; skip silently. + return; + } + let (ro, rw) = baseline_enrichment_paths(); + assert!( + rw.contains(&"/proc".to_string()), + "/proc should be in read_write when GPU is present" + ); + assert!( + !ro.contains(&"/proc".to_string()), + "/proc should NOT be in read_only when it is already in read_write" + ); + } + + #[test] + fn proc_in_read_only_without_gpu() { + if has_gpu_devices() { + // On a GPU host we can't test the non-GPU path; skip silently. + return; + } + let (ro, _rw) = baseline_enrichment_paths(); + assert!( + ro.contains(&"/proc".to_string()), + "/proc should be in read_only when GPU is not present" + ); + } + + #[test] + fn baseline_read_write_always_includes_sandbox_and_tmp() { + let (_ro, rw) = baseline_enrichment_paths(); + assert!(rw.contains(&"/sandbox".to_string())); + assert!(rw.contains(&"/tmp".to_string())); + } + + #[test] + fn enumerate_gpu_device_nodes_skips_bare_nvidia() { + // "nvidia" (without a trailing digit) is a valid /dev entry on some + // systems but is not a per-GPU device node. The enumerator must + // not match it. + let nodes = enumerate_gpu_device_nodes(); + assert!( + !nodes.contains(&"/dev/nvidia".to_string()), + "bare /dev/nvidia should not be enumerated: {nodes:?}" + ); + } + + #[test] + fn no_duplicate_paths_in_baseline() { + let (ro, rw) = baseline_enrichment_paths(); + // No path should appear in both lists. + for path in &ro { + assert!( + !rw.contains(path), + "path {path} appears in both read_only and read_write" + ); + } + } +} + /// Load sandbox policy from local files or gRPC. 
/// /// Priority: diff --git a/e2e/python/test_inference_routing.py b/e2e/python/test_inference_routing.py index c35e02535..c9d3965f1 100644 --- a/e2e/python/test_inference_routing.py +++ b/e2e/python/test_inference_routing.py @@ -32,7 +32,7 @@ _BASE_FILESYSTEM = sandbox_pb2.FilesystemPolicy( include_workdir=True, - read_only=["/usr", "/lib", "/etc", "/app", "/var/log"], + read_only=["/usr", "/lib", "/etc", "/app", "/var/log", "/proc", "/dev/urandom"], read_write=["/sandbox", "/tmp"], ) _BASE_LANDLOCK = sandbox_pb2.LandlockPolicy(compatibility="best_effort") diff --git a/e2e/python/test_sandbox_policy.py b/e2e/python/test_sandbox_policy.py index 6a4bf5ed2..625fe8da0 100644 --- a/e2e/python/test_sandbox_policy.py +++ b/e2e/python/test_sandbox_policy.py @@ -22,7 +22,7 @@ _BASE_FILESYSTEM = sandbox_pb2.FilesystemPolicy( include_workdir=True, - read_only=["/usr", "/lib", "/etc", "/app", "/var/log"], + read_only=["/usr", "/lib", "/etc", "/app", "/var/log", "/proc", "/dev/urandom"], read_write=["/sandbox", "/tmp"], ) _BASE_LANDLOCK = sandbox_pb2.LandlockPolicy(compatibility="best_effort") From c6f3087888cac69ba09fdd22ec111734497d3e7c Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Wed, 1 Apr 2026 21:39:41 -0700 Subject: [PATCH 34/45] fix(sandbox): relay WebSocket frames after HTTP 101 Switching Protocols (#718) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Detect 101 Switching Protocols in relay_response() and switch to raw bidirectional TCP relay instead of re-entering the HTTP parsing loop. Previously, is_bodiless_response() treated 101 as a generic 1xx informational response, forwarding only the headers and returning to the HTTP parsing loop. After a 101, subsequent bytes are upgraded protocol frames (e.g. WebSocket), not HTTP — causing the relay to block or silently drop all post-upgrade traffic. 
Changes: - Add RelayOutcome enum (Reusable/Consumed/Upgraded) replacing bool return type across L7Provider::relay trait and all relay functions - Detect 101 before generic 1xx handler in relay_response(), capture overflow bytes, return RelayOutcome::Upgraded - Validate client sent Upgrade + Connection: Upgrade headers before accepting 101 (rejects unsolicited upgrades from non-compliant upstream servers) - Extract shared handle_upgrade() helper used by both relay_rest() and relay_passthrough_with_credentials() - Add l7_decision=allow_upgrade audit log annotation for upgrades - Add unit tests for 101 overflow capture, unsolicited 101 rejection, and client_requested_upgrade header validation - Add integration test: WebSocket echo through L7Provider::relay Fixes: #652 Co-authored-by: John Myers --- Cargo.lock | 2 + crates/openshell-sandbox/Cargo.toml | 2 + crates/openshell-sandbox/src/l7/provider.rs | 23 +- crates/openshell-sandbox/src/l7/relay.rs | 90 +++- crates/openshell-sandbox/src/l7/rest.rs | 392 ++++++++++++++++-- .../tests/websocket_upgrade.rs | 259 ++++++++++++ 6 files changed, 712 insertions(+), 56 deletions(-) create mode 100644 crates/openshell-sandbox/tests/websocket_upgrade.rs diff --git a/Cargo.lock b/Cargo.lock index e49301771..8f827bc88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2939,6 +2939,7 @@ dependencies = [ "base64 0.22.1", "bytes", "clap", + "futures", "hex", "hmac", "ipnet", @@ -2965,6 +2966,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-stream", + "tokio-tungstenite 0.26.2", "tonic", "tracing", "tracing-appender", diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index 26da57efd..68e696e95 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -81,6 +81,8 @@ uuid = { version = "1", features = ["v4"] } [dev-dependencies] tempfile = "3" temp-env = "0.3" +tokio-tungstenite = { workspace = true } +futures = { workspace = true } [lints] workspace = true diff --git 
a/crates/openshell-sandbox/src/l7/provider.rs b/crates/openshell-sandbox/src/l7/provider.rs index df0dfb292..7516aa85c 100644 --- a/crates/openshell-sandbox/src/l7/provider.rs +++ b/crates/openshell-sandbox/src/l7/provider.rs @@ -14,6 +14,22 @@ use std::collections::HashMap; use std::future::Future; use tokio::io::{AsyncRead, AsyncWrite}; +/// Outcome of relaying a single HTTP request/response pair. +#[derive(Debug)] +pub enum RelayOutcome { + /// Connection is reusable for further HTTP requests (keep-alive). + Reusable, + /// Connection was consumed (e.g. read-until-EOF or `Connection: close`). + Consumed, + /// Server responded with 101 Switching Protocols. + /// The connection has been upgraded (e.g. to WebSocket) and must be + /// relayed as raw bidirectional TCP from this point forward. + /// Contains any overflow bytes read from upstream past the 101 response + /// headers that belong to the upgraded protocol. The 101 headers + /// themselves have already been forwarded to the client. + Upgraded { overflow: Vec<u8> }, +} + /// Body framing for HTTP requests/responses. #[derive(Debug, Clone, Copy)] pub enum BodyLength { @@ -57,14 +73,15 @@ pub trait L7Provider: Send + Sync { /// Forward an allowed request to upstream and relay the response back. /// - /// Returns `true` if the upstream connection is reusable (keep-alive), - /// `false` if it was consumed (e.g. read-until-EOF or `Connection: close`). + /// Returns a [`RelayOutcome`] indicating whether the connection is + /// reusable (keep-alive), consumed, or has been upgraded (101 Switching + /// Protocols) and must be relayed as raw bidirectional TCP. 
fn relay<C, U>( &self, req: &L7Request, client: &mut C, upstream: &mut U, - ) -> impl Future<Output = Result<bool>> + Send + ) -> impl Future<Output = Result<RelayOutcome>> + Send where C: AsyncRead + AsyncWrite + Unpin + Send, U: AsyncRead + AsyncWrite + Unpin + Send; diff --git a/crates/openshell-sandbox/src/l7/relay.rs b/crates/openshell-sandbox/src/l7/relay.rs index 49caea64d..b2fb34b61 100644 --- a/crates/openshell-sandbox/src/l7/relay.rs +++ b/crates/openshell-sandbox/src/l7/relay.rs @@ -7,7 +7,7 @@ //! Parses each request within the tunnel, evaluates it against OPA policy, //! and either forwards or denies the request. -use crate::l7::provider::L7Provider; +use crate::l7::provider::{L7Provider, RelayOutcome}; use crate::l7::{EnforcementMode, L7EndpointConfig, L7Protocol, L7RequestInfo}; use crate::secrets::{self, SecretResolver}; use miette::{IntoDiagnostic, Result, miette}; @@ -68,6 +68,40 @@ where } } +/// Handle an upgraded connection (101 Switching Protocols). +/// +/// Forwards any overflow bytes from the upgrade response to the client, then +/// switches to raw bidirectional TCP copy for the upgraded protocol (WebSocket, +/// HTTP/2, etc.). L7 policy enforcement does not apply after the upgrade — +/// the initial HTTP request was already evaluated. +async fn handle_upgrade<C, U>( + client: &mut C, + upstream: &mut U, + overflow: Vec<u8>, + host: &str, + port: u16, +) -> Result<()> +where + C: AsyncRead + AsyncWrite + Unpin + Send, + U: AsyncRead + AsyncWrite + Unpin + Send, +{ + info!( + host = %host, + port = port, + overflow_bytes = overflow.len(), + "101 Switching Protocols — switching to raw bidirectional relay \ + (L7 enforcement no longer active)" + ); + if !overflow.is_empty() { + client.write_all(&overflow).await.into_diagnostic()?; + client.flush().await.into_diagnostic()?; + } + tokio::io::copy_bidirectional(client, upstream) + .await + .into_diagnostic()?; + Ok(()) +} + /// REST relay loop: parse request -> evaluate -> allow/deny -> relay response -> repeat. 
async fn relay_rest( config: &L7EndpointConfig, @@ -137,10 +171,24 @@ where // Evaluate L7 policy via Rego (using redacted target) let (allowed, reason) = evaluate_l7_request(engine, ctx, &request_info)?; - let decision_str = match (allowed, config.enforcement) { - (true, _) => "allow", - (false, EnforcementMode::Audit) => "audit", - (false, EnforcementMode::Enforce) => "deny", + // Check if this is an upgrade request for logging purposes. + let header_end = req + .raw_header + .windows(4) + .position(|w| w == b"\r\n\r\n") + .map_or(req.raw_header.len(), |p| p + 4); + let is_upgrade_request = { + let h = String::from_utf8_lossy(&req.raw_header[..header_end]); + h.lines() + .skip(1) + .any(|l| l.to_ascii_lowercase().starts_with("upgrade:")) + }; + + let decision_str = match (allowed, config.enforcement, is_upgrade_request) { + (true, _, true) => "allow_upgrade", + (true, _, false) => "allow", + (false, EnforcementMode::Audit, _) => "audit", + (false, EnforcementMode::Enforce, _) => "deny", }; // Log every L7 decision (using redacted target — never log real secrets) @@ -162,20 +210,26 @@ where if allowed || config.enforcement == EnforcementMode::Audit { // Forward request to upstream and relay response - let reusable = crate::l7::rest::relay_http_request_with_resolver( + let outcome = crate::l7::rest::relay_http_request_with_resolver( &req, client, upstream, ctx.secret_resolver.as_deref(), ) .await?; - if !reusable { - debug!( - host = %ctx.host, - port = ctx.port, - "Upstream connection not reusable, closing L7 relay" - ); - return Ok(()); + match outcome { + RelayOutcome::Reusable => {} // continue loop + RelayOutcome::Consumed => { + debug!( + host = %ctx.host, + port = ctx.port, + "Upstream connection not reusable, closing L7 relay" + ); + return Ok(()); + } + RelayOutcome::Upgraded { overflow } => { + return handle_upgrade(client, upstream, overflow, &ctx.host, ctx.port).await; + } } } else { // Enforce mode: deny with 403 and close connection (use redacted 
target) @@ -334,12 +388,16 @@ where // Forward request with credential rewriting and relay the response. // relay_http_request_with_resolver handles both directions: it sends // the request upstream and reads the response back to the client. - let reusable = + let outcome = crate::l7::rest::relay_http_request_with_resolver(&req, client, upstream, resolver) .await?; - if !reusable { - break; + match outcome { + RelayOutcome::Reusable => {} // continue loop + RelayOutcome::Consumed => break, + RelayOutcome::Upgraded { overflow } => { + return handle_upgrade(client, upstream, overflow, &ctx.host, ctx.port).await; + } } } diff --git a/crates/openshell-sandbox/src/l7/rest.rs b/crates/openshell-sandbox/src/l7/rest.rs index ec5494c9d..0c136be79 100644 --- a/crates/openshell-sandbox/src/l7/rest.rs +++ b/crates/openshell-sandbox/src/l7/rest.rs @@ -7,12 +7,12 @@ //! policy, and relays allowed requests to upstream. Handles Content-Length //! and chunked transfer encoding for body framing. -use crate::l7::provider::{BodyLength, L7Provider, L7Request}; +use crate::l7::provider::{BodyLength, L7Provider, L7Request, RelayOutcome}; use crate::secrets::rewrite_http_header_block; use miette::{IntoDiagnostic, Result, miette}; use std::collections::HashMap; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; -use tracing::debug; +use tracing::{debug, warn}; const MAX_HEADER_BYTES: usize = 16384; // 16 KiB for HTTP headers const RELAY_BUF_SIZE: usize = 8192; @@ -32,7 +32,12 @@ impl L7Provider for RestProvider { parse_http_request(client).await } - async fn relay<C, U>(&self, req: &L7Request, client: &mut C, upstream: &mut U) -> Result<bool> + async fn relay<C, U>( + &self, + req: &L7Request, + client: &mut C, + upstream: &mut U, + ) -> Result<RelayOutcome> where C: AsyncRead + AsyncWrite + Unpin + Send, U: AsyncRead + AsyncWrite + Unpin + Send, @@ -236,8 +241,13 @@ fn decode_hex_nibble(byte: u8) -> Option<u8> { /// Forward an allowed HTTP request to upstream and relay the response back. 
/// -/// Returns `true` if the upstream connection is reusable, `false` if consumed. -async fn relay_http_request(req: &L7Request, client: &mut C, upstream: &mut U) -> Result +/// Returns the relay outcome indicating whether the connection is reusable, +/// consumed, or has been upgraded (e.g. WebSocket via 101 Switching Protocols). +async fn relay_http_request( + req: &L7Request, + client: &mut C, + upstream: &mut U, +) -> Result where C: AsyncRead + AsyncWrite + Unpin, U: AsyncRead + AsyncWrite + Unpin, @@ -250,7 +260,7 @@ pub(crate) async fn relay_http_request_with_resolver( client: &mut C, upstream: &mut U, resolver: Option<&crate::secrets::SecretResolver>, -) -> Result +) -> Result where C: AsyncRead + AsyncWrite + Unpin, U: AsyncRead + AsyncWrite + Unpin, @@ -288,8 +298,27 @@ where BodyLength::None => {} } upstream.flush().await.into_diagnostic()?; - let (reusable, _) = relay_response(&req.action, upstream, client).await?; - Ok(reusable) + + let outcome = relay_response(&req.action, upstream, client).await?; + + // Validate that the client actually requested an upgrade before accepting + // a 101 from upstream. Per RFC 9110 Section 7.8, the server MUST NOT send + // 101 unless the client sent Upgrade + Connection: Upgrade headers. A + // non-compliant or malicious upstream could send an unsolicited 101 to + // bypass L7 inspection. + if matches!(outcome, RelayOutcome::Upgraded { .. }) { + let header_str = String::from_utf8_lossy(&req.raw_header[..header_end]); + if !client_requested_upgrade(&header_str) { + warn!( + method = %req.action, + target = %req.target, + "upstream sent unsolicited 101 without client Upgrade request — closing connection" + ); + return Ok(RelayOutcome::Consumed); + } + } + + Ok(outcome) } /// Send a 403 Forbidden JSON deny response. @@ -525,29 +554,28 @@ fn find_crlf(buf: &[u8], start: usize) -> Option { /// Read and relay a full HTTP response (headers + body) from upstream to client. 
/// -/// Returns `true` if the upstream connection is reusable (keep-alive), -/// `false` if it was consumed (read-until-EOF or `Connection: close`). -/// Relay an HTTP response from upstream back to the client. +/// Returns a [`RelayOutcome`] indicating whether the connection is reusable, +/// consumed, or has been upgraded (101 Switching Protocols). /// -/// Returns `true` if the connection should stay alive for further requests. +/// Note: callers that receive `Upgraded` are responsible for switching to +/// raw bidirectional relay and forwarding the overflow bytes. pub(crate) async fn relay_response_to_client( upstream: &mut U, client: &mut C, request_method: &str, -) -> Result +) -> Result where U: AsyncRead + Unpin, C: AsyncWrite + Unpin, { - let (reusable, _status) = relay_response(request_method, upstream, client).await?; - Ok(reusable) + relay_response(request_method, upstream, client).await } async fn relay_response( request_method: &str, upstream: &mut U, client: &mut C, -) -> Result<(bool, u16)> +) -> Result where U: AsyncRead + Unpin, C: AsyncWrite + Unpin, @@ -568,7 +596,7 @@ where if !buf.is_empty() { client.write_all(&buf).await.into_diagnostic()?; } - return Ok((false, 0)); + return Ok(RelayOutcome::Consumed); } buf.extend_from_slice(&tmp[..n]); @@ -594,6 +622,26 @@ where "relay_response framing" ); + // 101 Switching Protocols: the connection has been upgraded (e.g. to + // WebSocket). Forward the 101 headers to the client and signal the + // caller to switch to raw bidirectional TCP relay. Any bytes read + // from upstream beyond the headers are overflow that belong to the + // upgraded protocol and must be forwarded before switching. 
+ if status_code == 101 { + client + .write_all(&buf[..header_end]) + .await + .into_diagnostic()?; + client.flush().await.into_diagnostic()?; + let overflow = buf[header_end..].to_vec(); + debug!( + request_method, + overflow_bytes = overflow.len(), + "101 Switching Protocols — signaling protocol upgrade" + ); + return Ok(RelayOutcome::Upgraded { overflow }); + } + // Bodiless responses (HEAD, 1xx, 204, 304): forward headers only, skip body if is_bodiless_response(request_method, status_code) { client @@ -601,7 +649,11 @@ where .await .into_diagnostic()?; client.flush().await.into_diagnostic()?; - return Ok((!server_wants_close, status_code)); + return if server_wants_close { + Ok(RelayOutcome::Consumed) + } else { + Ok(RelayOutcome::Reusable) + }; } // No explicit framing (no Content-Length, no Transfer-Encoding). @@ -621,7 +673,7 @@ where } relay_until_eof(upstream, client).await?; client.flush().await.into_diagnostic()?; - return Ok((false, status_code)); + return Ok(RelayOutcome::Consumed); } // No Connection: close — an HTTP/1.1 keep-alive server that omits // framing headers has an empty body. Forward headers and continue @@ -632,7 +684,7 @@ where .await .into_diagnostic()?; client.flush().await.into_diagnostic()?; - return Ok((true, status_code)); + return Ok(RelayOutcome::Reusable); } // Forward response headers + any overflow body bytes @@ -665,7 +717,7 @@ where // loop will exit via the normal error path. Exiting early here would // tear down the CONNECT tunnel before the client can detect the close, // causing ~30 s retry delays in clients like `gh`. - Ok((true, status_code)) + Ok(RelayOutcome::Reusable) } /// Parse the HTTP status code from a response status line. @@ -689,6 +741,33 @@ fn parse_connection_close(headers: &str) -> bool { false } +/// Check if the client request headers contain both `Upgrade` and +/// `Connection: Upgrade` headers, indicating the client requested a +/// protocol upgrade (e.g. WebSocket). 
+/// +/// Per RFC 9110 Section 7.8, a server MUST NOT send 101 Switching Protocols +/// unless the client sent these headers. +fn client_requested_upgrade(headers: &str) -> bool { + let mut has_upgrade_header = false; + let mut connection_contains_upgrade = false; + + for line in headers.lines().skip(1) { + let lower = line.to_ascii_lowercase(); + if lower.starts_with("upgrade:") { + has_upgrade_header = true; + } + if lower.starts_with("connection:") { + let val = lower.split_once(':').map_or("", |(_, v)| v.trim()); + // Connection header can have comma-separated values + if val.split(',').any(|tok| tok.trim() == "upgrade") { + connection_contains_upgrade = true; + } + } + } + + has_upgrade_header && connection_contains_upgrade +} + /// Returns true for responses that MUST NOT contain a message body per RFC 7230 §3.3.3: /// HEAD responses, 1xx informational, 204 No Content, 304 Not Modified. fn is_bodiless_response(request_method: &str, status_code: u16) -> bool { @@ -1136,8 +1215,11 @@ mod tests { .await .expect("relay_response should not deadlock"); - let (reusable, _status) = result.expect("relay_response should succeed"); - assert!(!reusable, "connection consumed by read-until-EOF"); + let outcome = result.expect("relay_response should succeed"); + assert!( + matches!(outcome, RelayOutcome::Consumed), + "connection consumed by read-until-EOF" + ); client_write.shutdown().await.unwrap(); let mut received = Vec::new(); @@ -1174,8 +1256,11 @@ mod tests { .await .expect("must not block when no Connection: close"); - let (reusable, _status) = result.expect("relay_response should succeed"); - assert!(reusable, "keep-alive implied, connection reusable"); + let outcome = result.expect("relay_response should succeed"); + assert!( + matches!(outcome, RelayOutcome::Reusable), + "keep-alive implied, connection reusable" + ); client_write.shutdown().await.unwrap(); let mut received = Vec::new(); @@ -1207,8 +1292,11 @@ mod tests { .await .expect("HEAD relay must not 
deadlock waiting for body"); - let (reusable, _status) = result.expect("relay_response should succeed"); - assert!(reusable, "HEAD response should be reusable"); + let outcome = result.expect("relay_response should succeed"); + assert!( + matches!(outcome, RelayOutcome::Reusable), + "HEAD response should be reusable" + ); client_write.shutdown().await.unwrap(); let mut received = Vec::new(); @@ -1237,8 +1325,11 @@ mod tests { .await .expect("204 relay must not deadlock"); - let (reusable, _status) = result.expect("relay_response should succeed"); - assert!(reusable, "204 response should be reusable"); + let outcome = result.expect("relay_response should succeed"); + assert!( + matches!(outcome, RelayOutcome::Reusable), + "204 response should be reusable" + ); client_write.shutdown().await.unwrap(); let mut received = Vec::new(); @@ -1269,8 +1360,11 @@ mod tests { .await .expect("must not block when chunked body is complete in overflow"); - let (reusable, _status) = result.expect("relay_response should succeed"); - assert!(reusable, "connection should be reusable"); + let outcome = result.expect("relay_response should succeed"); + assert!( + matches!(outcome, RelayOutcome::Reusable), + "connection should be reusable" + ); client_write.shutdown().await.unwrap(); let mut received = Vec::new(); @@ -1305,8 +1399,11 @@ mod tests { .await .expect("must not block when chunked response has trailers"); - let (reusable, _status) = result.expect("relay_response should succeed"); - assert!(reusable, "chunked response should be reusable"); + let outcome = result.expect("relay_response should succeed"); + assert!( + matches!(outcome, RelayOutcome::Reusable), + "chunked response should be reusable" + ); client_write.shutdown().await.unwrap(); let mut received = Vec::new(); @@ -1340,8 +1437,11 @@ mod tests { .await .expect("normal relay must not deadlock"); - let (reusable, _status) = result.expect("relay_response should succeed"); - assert!(reusable, "Content-Length response 
should be reusable"); + let outcome = result.expect("relay_response should succeed"); + assert!( + matches!(outcome, RelayOutcome::Reusable), + "Content-Length response should be reusable" + ); client_write.shutdown().await.unwrap(); let mut received = Vec::new(); @@ -1368,12 +1468,12 @@ mod tests { .await .expect("relay must not deadlock"); - let (reusable, _status) = result.expect("relay_response should succeed"); + let outcome = result.expect("relay_response should succeed"); // With explicit framing, Connection: close is still reported as reusable // so the relay loop continues. The *next* upstream write will fail and // exit the loop via the normal error path. assert!( - reusable, + matches!(outcome, RelayOutcome::Reusable), "explicit framing keeps loop alive despite Connection: close" ); @@ -1383,6 +1483,224 @@ mod tests { assert!(String::from_utf8_lossy(&received).contains("hello")); } + #[tokio::test] + async fn relay_response_101_switching_protocols_returns_upgraded_with_overflow() { + // Build a 101 response followed by WebSocket frame data (overflow). 
+ let mut response = Vec::new(); + response.extend_from_slice(b"HTTP/1.1 101 Switching Protocols\r\n"); + response.extend_from_slice(b"Upgrade: websocket\r\n"); + response.extend_from_slice(b"Connection: Upgrade\r\n"); + response.extend_from_slice(b"\r\n"); + response.extend_from_slice(b"\x81\x05hello"); // WebSocket frame + + let (upstream_read, mut upstream_write) = tokio::io::duplex(4096); + let (mut client_read, client_write) = tokio::io::duplex(4096); + + upstream_write.write_all(&response).await.unwrap(); + drop(upstream_write); + + let mut upstream_read = upstream_read; + let mut client_write = client_write; + + let result = tokio::time::timeout( + std::time::Duration::from_secs(2), + relay_response("GET", &mut upstream_read, &mut client_write), + ) + .await + .expect("relay_response should not deadlock"); + + let outcome = result.expect("relay_response should succeed"); + match outcome { + RelayOutcome::Upgraded { overflow } => { + assert_eq!( + &overflow, b"\x81\x05hello", + "overflow should contain WebSocket frame data" + ); + } + other => panic!("Expected Upgraded, got {other:?}"), + } + + client_write.shutdown().await.unwrap(); + let mut received = Vec::new(); + client_read.read_to_end(&mut received).await.unwrap(); + let received_str = String::from_utf8_lossy(&received); + assert!( + received_str.contains("101 Switching Protocols"), + "client should receive the 101 response headers" + ); + } + + #[tokio::test] + async fn relay_response_101_no_overflow() { + // 101 response with no trailing bytes — overflow should be empty. 
+ let response = b"HTTP/1.1 101 Switching Protocols\r\nUpgrade: websocket\r\nConnection: Upgrade\r\n\r\n"; + + let (upstream_read, mut upstream_write) = tokio::io::duplex(4096); + let (_client_read, client_write) = tokio::io::duplex(4096); + + upstream_write.write_all(response).await.unwrap(); + drop(upstream_write); + + let mut upstream_read = upstream_read; + let mut client_write = client_write; + + let result = tokio::time::timeout( + std::time::Duration::from_secs(2), + relay_response("GET", &mut upstream_read, &mut client_write), + ) + .await + .expect("relay_response should not deadlock"); + + match result.expect("should succeed") { + RelayOutcome::Upgraded { overflow } => { + assert!(overflow.is_empty(), "no overflow expected"); + } + other => panic!("Expected Upgraded, got {other:?}"), + } + } + + #[tokio::test] + async fn relay_rejects_unsolicited_101_without_client_upgrade_header() { + // Client sends a normal GET without Upgrade headers. + // Upstream responds with 101 (non-compliant). The relay should + // reject the upgrade and return Consumed instead. 
+ let (mut proxy_to_upstream, mut upstream_side) = tokio::io::duplex(8192); + let (mut _app_side, mut proxy_to_client) = tokio::io::duplex(8192); + + let req = L7Request { + action: "GET".to_string(), + target: "/api".to_string(), + query_params: HashMap::new(), + raw_header: b"GET /api HTTP/1.1\r\nHost: example.com\r\n\r\n".to_vec(), + body_length: BodyLength::None, + }; + + let upstream_task = tokio::spawn(async move { + // Read the request + let mut buf = vec![0u8; 4096]; + let mut total = 0; + loop { + let n = upstream_side.read(&mut buf[total..]).await.unwrap(); + if n == 0 { + break; + } + total += n; + if buf[..total].windows(4).any(|w| w == b"\r\n\r\n") { + break; + } + } + // Send unsolicited 101 + upstream_side + .write_all( + b"HTTP/1.1 101 Switching Protocols\r\nUpgrade: websocket\r\nConnection: Upgrade\r\n\r\n", + ) + .await + .unwrap(); + upstream_side.flush().await.unwrap(); + }); + + let result = tokio::time::timeout( + std::time::Duration::from_secs(5), + relay_http_request_with_resolver( + &req, + &mut proxy_to_client, + &mut proxy_to_upstream, + None, + ), + ) + .await + .expect("relay must not deadlock"); + + let outcome = result.expect("relay should succeed"); + assert!( + matches!(outcome, RelayOutcome::Consumed), + "unsolicited 101 should be rejected as Consumed, got {outcome:?}" + ); + + upstream_task.await.expect("upstream task should complete"); + } + + #[tokio::test] + async fn relay_accepts_101_with_client_upgrade_header() { + // Client sends a proper upgrade request with Upgrade + Connection headers. 
+ let (mut proxy_to_upstream, mut upstream_side) = tokio::io::duplex(8192); + let (mut _app_side, mut proxy_to_client) = tokio::io::duplex(8192); + + let req = L7Request { + action: "GET".to_string(), + target: "/ws".to_string(), + query_params: HashMap::new(), + raw_header: b"GET /ws HTTP/1.1\r\nHost: example.com\r\nUpgrade: websocket\r\nConnection: Upgrade\r\n\r\n".to_vec(), + body_length: BodyLength::None, + }; + + let upstream_task = tokio::spawn(async move { + let mut buf = vec![0u8; 4096]; + let mut total = 0; + loop { + let n = upstream_side.read(&mut buf[total..]).await.unwrap(); + if n == 0 { + break; + } + total += n; + if buf[..total].windows(4).any(|w| w == b"\r\n\r\n") { + break; + } + } + upstream_side + .write_all( + b"HTTP/1.1 101 Switching Protocols\r\nUpgrade: websocket\r\nConnection: Upgrade\r\n\r\n", + ) + .await + .unwrap(); + upstream_side.flush().await.unwrap(); + }); + + let result = tokio::time::timeout( + std::time::Duration::from_secs(5), + relay_http_request_with_resolver( + &req, + &mut proxy_to_client, + &mut proxy_to_upstream, + None, + ), + ) + .await + .expect("relay must not deadlock"); + + let outcome = result.expect("relay should succeed"); + assert!( + matches!(outcome, RelayOutcome::Upgraded { .. 
}), + "proper upgrade request should be accepted, got {outcome:?}" + ); + + upstream_task.await.expect("upstream task should complete"); + } + + #[test] + fn client_requested_upgrade_detects_websocket_headers() { + let headers = "GET /ws HTTP/1.1\r\nHost: example.com\r\nUpgrade: websocket\r\nConnection: Upgrade\r\n\r\n"; + assert!(client_requested_upgrade(headers)); + } + + #[test] + fn client_requested_upgrade_rejects_missing_upgrade_header() { + let headers = "GET /api HTTP/1.1\r\nHost: example.com\r\n\r\n"; + assert!(!client_requested_upgrade(headers)); + } + + #[test] + fn client_requested_upgrade_rejects_upgrade_without_connection() { + let headers = "GET /ws HTTP/1.1\r\nHost: example.com\r\nUpgrade: websocket\r\n\r\n"; + assert!(!client_requested_upgrade(headers)); + } + + #[test] + fn client_requested_upgrade_handles_comma_separated_connection() { + let headers = "GET /ws HTTP/1.1\r\nHost: example.com\r\nUpgrade: websocket\r\nConnection: keep-alive, Upgrade\r\n\r\n"; + assert!(client_requested_upgrade(headers)); + } + #[test] fn rewrite_header_block_resolves_placeholder_auth_headers() { let (_, resolver) = SecretResolver::from_provider_env( diff --git a/crates/openshell-sandbox/tests/websocket_upgrade.rs b/crates/openshell-sandbox/tests/websocket_upgrade.rs new file mode 100644 index 000000000..ec226c9cf --- /dev/null +++ b/crates/openshell-sandbox/tests/websocket_upgrade.rs @@ -0,0 +1,259 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration test: WebSocket upgrade through the L7 relay. +//! +//! Spins up a dummy WebSocket echo server, connects a client through the +//! `L7Provider::relay` pipeline, validates the 101 upgrade succeeds, and +//! exchanges a WebSocket text frame bidirectionally. +//! +//! This test exercises the full upgrade path described in issue #652: +//! 1. Client sends HTTP GET with `Upgrade: websocket` headers +//! 2. 
Relay forwards to upstream, upstream responds with 101 +//! 3. Relay detects 101, validates client Upgrade headers, returns `Upgraded` +//! 4. Caller forwards overflow + switches to `copy_bidirectional` +//! 5. Client and server exchange a WebSocket text message +//! +//! Reproduction scenario from #652: raw socket test sends upgrade request +//! through the proxy, receives 101, then verifies WebSocket frames flow. + +use futures::SinkExt; +use futures::stream::StreamExt; +use openshell_sandbox::l7::provider::{BodyLength, L7Provider, L7Request, RelayOutcome}; +use openshell_sandbox::l7::rest::RestProvider; +use std::collections::HashMap; +use std::net::SocketAddr; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_tungstenite::accept_async; +use tokio_tungstenite::tungstenite::Message; + +/// Start a minimal WebSocket echo server on an ephemeral port. +async fn start_ws_echo_server() -> SocketAddr { + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + tokio::spawn(async move { + let (stream, _) = listener.accept().await.unwrap(); + let ws_stream = accept_async(stream).await.unwrap(); + let (mut write, mut read) = ws_stream.split(); + + while let Some(msg) = read.next().await { + match msg { + Ok(Message::Text(text)) => { + write + .send(Message::Text(format!("echo: {text}").into())) + .await + .unwrap(); + } + Ok(Message::Close(_)) => break, + Ok(_) => {} + Err(_) => break, + } + } + }); + + addr +} + +/// Build raw HTTP upgrade request bytes (mimics the reproduction script from #652). +fn build_ws_upgrade_request(host: &str) -> Vec { + format!( + "GET / HTTP/1.1\r\n\ + Host: {host}\r\n\ + Upgrade: websocket\r\n\ + Connection: Upgrade\r\n\ + Sec-WebSocket-Key: RylUQAh3p5cysfOlexgubw==\r\n\ + Sec-WebSocket-Version: 13\r\n\ + \r\n" + ) + .into_bytes() +} + +/// Build a masked WebSocket text frame (client -> server must be masked per RFC 6455). 
+fn build_ws_text_frame(payload: &[u8]) -> Vec { + let mask_key: [u8; 4] = [0x37, 0xfa, 0x21, 0x3d]; + let mut frame = Vec::new(); + frame.push(0x81); // FIN + text opcode + frame.push(0x80 | payload.len() as u8); // masked + length + frame.extend_from_slice(&mask_key); + for (i, b) in payload.iter().enumerate() { + frame.push(b ^ mask_key[i % 4]); + } + frame +} + +/// Core test: WebSocket upgrade through `L7Provider::relay`, then exchange a message. +/// +/// This mirrors the reproduction steps from issue #652: +/// - Send WebSocket upgrade → receive 101 → verify frames flow bidirectionally +/// - Previously, 101 was treated as a generic 1xx and frames were dropped +#[tokio::test] +async fn websocket_upgrade_through_l7_relay_exchanges_message() { + let ws_addr = start_ws_echo_server().await; + + // Open a real TCP connection to the WebSocket server (simulates upstream) + let mut upstream = TcpStream::connect(ws_addr).await.unwrap(); + + // In-memory duplex for the client side of the relay + let (mut client_app, mut client_proxy) = tokio::io::duplex(8192); + + let host = format!("127.0.0.1:{}", ws_addr.port()); + let raw_header = build_ws_upgrade_request(&host); + + let req = L7Request { + action: "GET".to_string(), + target: "/".to_string(), + query_params: HashMap::new(), + raw_header, + body_length: BodyLength::None, + }; + + // Run the relay in a background task (simulates what relay_rest does) + let relay_handle = tokio::spawn(async move { + let outcome = RestProvider + .relay(&req, &mut client_proxy, &mut upstream) + .await + .expect("relay should succeed"); + + match outcome { + RelayOutcome::Upgraded { overflow } => { + // This is what handle_upgrade() does in relay.rs + if !overflow.is_empty() { + client_proxy.write_all(&overflow).await.unwrap(); + client_proxy.flush().await.unwrap(); + } + let _ = tokio::io::copy_bidirectional(&mut client_proxy, &mut upstream).await; + } + other => panic!("Expected Upgraded, got {other:?}"), + } + }); + + // Client side: 
read the 101 response headers byte-by-byte + // (mirrors the reproduction script's recv() after sending the upgrade) + let mut response_buf = Vec::new(); + let mut tmp = [0u8; 1]; + tokio::time::timeout(std::time::Duration::from_secs(5), async { + loop { + client_app.read_exact(&mut tmp).await.unwrap(); + response_buf.push(tmp[0]); + if response_buf.ends_with(b"\r\n\r\n") { + break; + } + } + }) + .await + .expect("should receive 101 headers within 5 seconds"); + + let response_str = String::from_utf8_lossy(&response_buf); + assert!( + response_str.contains("101 Switching Protocols"), + "should receive 101, got: {response_str}" + ); + + // ---- This is the part that was broken before the fix (issue #652) ---- + // Previously, after 101, the relay re-entered the HTTP parsing loop and + // all WebSocket frames were silently dropped. The reproduction script + // would see RECV2: TIMEOUT here. + + // Send a WebSocket text frame + let frame = build_ws_text_frame(b"hello"); + client_app.write_all(&frame).await.unwrap(); + client_app.flush().await.unwrap(); + + // Read the echo response (unmasked server -> client frame) + tokio::time::timeout(std::time::Duration::from_secs(5), async { + let mut header = [0u8; 2]; + client_app.read_exact(&mut header).await.unwrap(); + + let fin_opcode = header[0]; + assert_eq!(fin_opcode & 0x0F, 1, "should be text frame"); + assert!(fin_opcode & 0x80 != 0, "FIN bit should be set"); + + let len = (header[1] & 0x7F) as usize; + let mut payload_buf = vec![0u8; len]; + client_app.read_exact(&mut payload_buf).await.unwrap(); + let text = String::from_utf8(payload_buf).unwrap(); + assert_eq!( + text, "echo: hello", + "server should echo our message back through the relay" + ); + }) + .await + .expect("should receive WebSocket echo within 5 seconds (previously timed out per #652)"); + + // Clean shutdown + let close_frame = [0x88, 0x82, 0x00, 0x00, 0x00, 0x00, 0x03, 0xe8]; + let _ = client_app.write_all(&close_frame).await; + drop(client_app); + 
+ let _ = tokio::time::timeout(std::time::Duration::from_secs(2), relay_handle).await; +} + +/// Test that a normal (non-upgrade) HTTP request still works correctly +/// after the relay_response changes. Ensures the 101 detection doesn't +/// break regular HTTP traffic. +#[tokio::test] +async fn normal_http_request_still_works_after_relay_changes() { + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + // Simple HTTP echo server + tokio::spawn(async move { + let (mut stream, _) = listener.accept().await.unwrap(); + let mut buf = vec![0u8; 4096]; + let mut total = 0; + loop { + let n = stream.read(&mut buf[total..]).await.unwrap(); + if n == 0 { + break; + } + total += n; + if buf[..total].windows(4).any(|w| w == b"\r\n\r\n") { + break; + } + } + stream + .write_all(b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nok") + .await + .unwrap(); + stream.flush().await.unwrap(); + }); + + let mut upstream = TcpStream::connect(addr).await.unwrap(); + let (mut client_read, mut client_proxy) = tokio::io::duplex(8192); + + let raw_header = format!( + "GET /api HTTP/1.1\r\nHost: 127.0.0.1:{}\r\n\r\n", + addr.port() + ) + .into_bytes(); + + let req = L7Request { + action: "GET".to_string(), + target: "/api".to_string(), + query_params: HashMap::new(), + raw_header, + body_length: BodyLength::None, + }; + + let outcome = tokio::time::timeout( + std::time::Duration::from_secs(5), + RestProvider.relay(&req, &mut client_proxy, &mut upstream), + ) + .await + .expect("should not deadlock") + .expect("relay should succeed"); + + assert!( + matches!(outcome, RelayOutcome::Reusable), + "normal 200 response should be Reusable, got {outcome:?}" + ); + + client_proxy.shutdown().await.unwrap(); + let mut received = Vec::new(); + client_read.read_to_end(&mut received).await.unwrap(); + let body = String::from_utf8_lossy(&received); + assert!(body.contains("200 OK"), "should forward 200 response"); + assert!(body.contains("ok"), 
"should forward response body"); +} From 7eb1df64bfc623c0765754b06ff3ee2570b0c270 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 2 Apr 2026 08:42:05 -0700 Subject: [PATCH 35/45] fix(cli): sandbox upload overwrites files instead of creating directories (#694) * fix(cli): sandbox upload overwrites files instead of creating directories When uploading a single file to an existing file path, sandbox_sync_up unconditionally ran mkdir -p on the destination, turning it into a directory. Split the destination into parent + basename for single-file uploads so tar extracts with the correct filename, matching cp/scp semantics. Consolidates duplicated SSH/tar boilerplate from sandbox_sync_up and sandbox_sync_up_files into a shared ssh_tar_upload helper. Closes #667 * better path defaults --- crates/openshell-cli/src/main.rs | 11 +- crates/openshell-cli/src/run.rs | 14 +- crates/openshell-cli/src/ssh.rs | 268 +++++++++++++++++++++---------- 3 files changed, 199 insertions(+), 94 deletions(-) diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 5f10ef903..c81476452 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -1091,8 +1091,8 @@ enum SandboxCommands { /// Upload local files into the sandbox before running. /// /// Format: `[:]`. - /// When `SANDBOX_PATH` is omitted, files are uploaded to the container - /// working directory (`/sandbox`). + /// When `SANDBOX_PATH` is omitted, files are uploaded to the container's + /// working directory. /// `.gitignore` rules are applied by default; use `--no-git-ignore` to /// upload everything. #[arg(long, value_hint = ValueHint::AnyPath, help_heading = "UPLOAD FLAGS")] @@ -1255,7 +1255,7 @@ enum SandboxCommands { #[arg(value_hint = ValueHint::AnyPath)] local_path: String, - /// Destination path in the sandbox (defaults to `/sandbox`). + /// Destination path in the sandbox (defaults to the container's working directory). 
dest: Option, /// Disable `.gitignore` filtering (uploads everything). @@ -2224,7 +2224,7 @@ async fn main() -> Result<()> { let ctx = resolve_gateway(&cli.gateway, &cli.gateway_endpoint)?; let mut tls = tls.with_gateway_name(&ctx.name); apply_edge_auth(&mut tls, &ctx.name); - let sandbox_dest = dest.as_deref().unwrap_or("/sandbox"); + let sandbox_dest = dest.as_deref(); let local = std::path::Path::new(&local_path); if !local.exists() { return Err(miette::miette!( @@ -2232,7 +2232,8 @@ async fn main() -> Result<()> { local.display() )); } - eprintln!("Uploading {} -> sandbox:{}", local.display(), sandbox_dest); + let dest_display = sandbox_dest.unwrap_or("~"); + eprintln!("Uploading {} -> sandbox:{}", local.display(), dest_display); if !no_git_ignore && let Ok((base_dir, files)) = run::git_sync_files(local) { run::sandbox_sync_up_files( &ctx.endpoint, diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index c4f2833d2..b3ec05383 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -2309,8 +2309,12 @@ pub async fn sandbox_create( drop(client); if let Some((local_path, sandbox_path, git_ignore)) = upload { - let dest = sandbox_path.as_deref().unwrap_or("/sandbox"); - eprintln!(" {} Uploading files to {dest}...", "\u{2022}".dimmed(),); + let dest = sandbox_path.as_deref(); + let dest_display = dest.unwrap_or("~"); + eprintln!( + " {} Uploading files to {dest_display}...", + "\u{2022}".dimmed(), + ); let local = Path::new(local_path); if *git_ignore && let Ok((base_dir, files)) = git_sync_files(local) { sandbox_sync_up_files( @@ -2628,7 +2632,6 @@ pub async fn sandbox_sync_command( ) -> Result<()> { match (up, down) { (Some(local_path), None) => { - let sandbox_dest = dest.unwrap_or("/sandbox"); let local = Path::new(local_path); if !local.exists() { return Err(miette::miette!( @@ -2636,8 +2639,9 @@ pub async fn sandbox_sync_command( local.display() )); } - eprintln!("Syncing {} -> sandbox:{}", 
local.display(), sandbox_dest); - sandbox_sync_up(server, name, local, sandbox_dest, tls).await?; + let dest_display = dest.unwrap_or("~"); + eprintln!("Syncing {} -> sandbox:{}", local.display(), dest_display); + sandbox_sync_up(server, name, local, dest, tls).await?; eprintln!("{} Sync complete", "✓".green().bold()); } (None, Some(sandbox_path)) => { diff --git a/crates/openshell-cli/src/ssh.rs b/crates/openshell-cli/src/ssh.rs index 4b284bff1..79fb64fb7 100644 --- a/crates/openshell-cli/src/ssh.rs +++ b/crates/openshell-cli/src/ssh.rs @@ -447,33 +447,51 @@ pub(crate) async fn sandbox_exec_without_exec( sandbox_exec_with_mode(server, name, command, tty, tls, false).await } -/// Push a list of files from a local directory into a sandbox using tar-over-SSH. +/// What to pack into the tar archive streamed to the sandbox. +enum UploadSource { + /// A single local file or directory. `tar_name` controls the entry name + /// inside the archive (e.g. the target basename for file-to-file uploads). + SinglePath { + local_path: PathBuf, + tar_name: std::ffi::OsString, + }, + /// A set of files relative to a base directory (git-filtered uploads). + FileList { + base_dir: PathBuf, + files: Vec, + }, +} + +/// Core tar-over-SSH upload: streams a tar archive into `dest_dir` on the +/// sandbox. Callers are responsible for splitting the destination path so +/// that `dest_dir` is always a directory. /// -/// This replaces the old rsync-based sync. Files are streamed as a tar archive -/// to `ssh ... tar xf - -C ` on the sandbox side. -pub async fn sandbox_sync_up_files( +/// When `dest_dir` is `None`, the sandbox user's home directory (`$HOME`) is +/// used as the extraction target. This avoids hard-coding any particular +/// path and works for custom container images with non-default `WORKDIR`. 
+async fn ssh_tar_upload( server: &str, name: &str, - base_dir: &Path, - files: &[String], - dest: &str, + dest_dir: Option<&str>, + source: UploadSource, tls: &TlsOptions, ) -> Result<()> { - if files.is_empty() { - return Ok(()); - } - let session = ssh_session_config(server, name, tls).await?; + // When no explicit destination is given, use the unescaped `$HOME` shell + // variable so the remote shell resolves it at runtime. + let escaped_dest = match dest_dir { + Some(d) => shell_escape(d), + None => "$HOME".to_string(), + }; + let mut ssh = ssh_base_command(&session.proxy_command); ssh.arg("-T") .arg("-o") .arg("RequestTTY=no") .arg("sandbox") .arg(format!( - "mkdir -p {} && cat | tar xf - -C {}", - shell_escape(dest), - shell_escape(dest) + "mkdir -p {escaped_dest} && cat | tar xf - -C {escaped_dest}", )) .stdin(Stdio::piped()) .stdout(Stdio::inherit()) @@ -486,22 +504,43 @@ pub async fn sandbox_sync_up_files( .ok_or_else(|| miette::miette!("failed to open stdin for ssh process"))?; // Build the tar archive in a blocking task since the tar crate is synchronous. 
- let base_dir = base_dir.to_path_buf(); - let files = files.to_vec(); tokio::task::spawn_blocking(move || -> Result<()> { let mut archive = tar::Builder::new(stdin); - for file in &files { - let full_path = base_dir.join(file); - if full_path.is_file() { - archive - .append_path_with_name(&full_path, file) - .into_diagnostic() - .wrap_err_with(|| format!("failed to add {file} to tar archive"))?; - } else if full_path.is_dir() { - archive - .append_dir_all(file, &full_path) - .into_diagnostic() - .wrap_err_with(|| format!("failed to add directory {file} to tar archive"))?; + match source { + UploadSource::SinglePath { + local_path, + tar_name, + } => { + if local_path.is_file() { + archive + .append_path_with_name(&local_path, &tar_name) + .into_diagnostic()?; + } else if local_path.is_dir() { + archive.append_dir_all(".", &local_path).into_diagnostic()?; + } else { + return Err(miette::miette!( + "local path does not exist: {}", + local_path.display() + )); + } + } + UploadSource::FileList { base_dir, files } => { + for file in &files { + let full_path = base_dir.join(file); + if full_path.is_file() { + archive + .append_path_with_name(&full_path, file) + .into_diagnostic() + .wrap_err_with(|| format!("failed to add {file} to tar archive"))?; + } else if full_path.is_dir() { + archive + .append_dir_all(file, &full_path) + .into_diagnostic() + .wrap_err_with(|| { + format!("failed to add directory {file} to tar archive") + })?; + } + } } } archive.finish().into_diagnostic()?; @@ -524,72 +563,112 @@ pub async fn sandbox_sync_up_files( Ok(()) } +/// Split a sandbox path into (parent_directory, basename). 
+/// +/// Examples: +/// `"/sandbox/.bashrc"` -> `("/sandbox", ".bashrc")` +/// `"/sandbox/sub/file"` -> `("/sandbox/sub", "file")` +/// `"file.txt"` -> `(".", "file.txt")` +fn split_sandbox_path(path: &str) -> (&str, &str) { + match path.rfind('/') { + Some(0) => ("/", &path[1..]), + Some(pos) => (&path[..pos], &path[pos + 1..]), + None => (".", path), + } +} + +/// Push a list of files from a local directory into a sandbox using tar-over-SSH. +/// +/// Files are streamed as a tar archive to `ssh ... tar xf - -C ` on +/// the sandbox side. When `dest` is `None`, files are uploaded to the +/// sandbox user's home directory. +pub async fn sandbox_sync_up_files( + server: &str, + name: &str, + base_dir: &Path, + files: &[String], + dest: Option<&str>, + tls: &TlsOptions, +) -> Result<()> { + if files.is_empty() { + return Ok(()); + } + ssh_tar_upload( + server, + name, + dest, + UploadSource::FileList { + base_dir: base_dir.to_path_buf(), + files: files.to_vec(), + }, + tls, + ) + .await +} + /// Push a local path (file or directory) into a sandbox using tar-over-SSH. +/// +/// When `sandbox_path` is `None`, files are uploaded to the sandbox user's +/// home directory. When uploading a single file to an explicit destination +/// that does not end with `/`, the destination is treated as a file path: +/// the parent directory is created and the file is written with the +/// destination's basename. This matches `cp` / `scp` semantics. 
pub async fn sandbox_sync_up( server: &str, name: &str, local_path: &Path, - sandbox_path: &str, + sandbox_path: Option<&str>, tls: &TlsOptions, ) -> Result<()> { - let session = ssh_session_config(server, name, tls).await?; - - let mut ssh = ssh_base_command(&session.proxy_command); - ssh.arg("-T") - .arg("-o") - .arg("RequestTTY=no") - .arg("sandbox") - .arg(format!( - "mkdir -p {} && cat | tar xf - -C {}", - shell_escape(sandbox_path), - shell_escape(sandbox_path) - )) - .stdin(Stdio::piped()) - .stdout(Stdio::inherit()) - .stderr(Stdio::inherit()); - - let mut child = ssh.spawn().into_diagnostic()?; - let stdin = child - .stdin - .take() - .ok_or_else(|| miette::miette!("failed to open stdin for ssh process"))?; - - let local_path = local_path.to_path_buf(); - tokio::task::spawn_blocking(move || -> Result<()> { - let mut archive = tar::Builder::new(stdin); - if local_path.is_file() { - let file_name = local_path - .file_name() - .ok_or_else(|| miette::miette!("path has no file name"))?; - archive - .append_path_with_name(&local_path, file_name) - .into_diagnostic()?; - } else if local_path.is_dir() { - archive.append_dir_all(".", &local_path).into_diagnostic()?; - } else { - return Err(miette::miette!( - "local path does not exist: {}", - local_path.display() - )); + // When an explicit destination is given and looks like a file path (does + // not end with '/'), split into parent directory + target basename so that + // `mkdir -p` creates the parent and tar extracts the file with the right + // name. + // + // Exception: if splitting would yield "/" as the parent (e.g. the user + // passed "/sandbox"), fall through to directory semantics instead. The + // sandbox user cannot write to "/" and the intent is almost certainly + // "put the file inside /sandbox", not "create a file named sandbox in /". 
+ if let Some(path) = sandbox_path { + if local_path.is_file() && !path.ends_with('/') { + let (parent, target_name) = split_sandbox_path(path); + if parent != "/" { + return ssh_tar_upload( + server, + name, + Some(parent), + UploadSource::SinglePath { + local_path: local_path.to_path_buf(), + tar_name: target_name.into(), + }, + tls, + ) + .await; + } } - archive.finish().into_diagnostic()?; - Ok(()) - }) - .await - .into_diagnostic()??; - - let status = tokio::task::spawn_blocking(move || child.wait()) - .await - .into_diagnostic()? - .into_diagnostic()?; - - if !status.success() { - return Err(miette::miette!( - "ssh tar extract exited with status {status}" - )); } - Ok(()) + let tar_name = if local_path.is_file() { + local_path + .file_name() + .ok_or_else(|| miette::miette!("path has no file name"))? + .to_os_string() + } else { + // For directories the tar_name is unused — append_dir_all uses "." + ".".into() + }; + + ssh_tar_upload( + server, + name, + sandbox_path, + UploadSource::SinglePath { + local_path: local_path.to_path_buf(), + tar_name, + }, + tls, + ) + .await } /// Pull a path from a sandbox to a local destination using tar-over-SSH. 
@@ -1149,4 +1228,25 @@ mod tests { assert!(message.contains("Forwarding port 3000 to sandbox demo")); assert!(message.contains("Access at: http://localhost:3000/")); } + + #[test] + fn split_sandbox_path_separates_parent_and_basename() { + assert_eq!( + split_sandbox_path("/sandbox/.bashrc"), + ("/sandbox", ".bashrc") + ); + assert_eq!( + split_sandbox_path("/sandbox/sub/file"), + ("/sandbox/sub", "file") + ); + assert_eq!(split_sandbox_path("/a/b/c/d.txt"), ("/a/b/c", "d.txt")); + } + + #[test] + fn split_sandbox_path_handles_root_and_bare_names() { + // File directly under root + assert_eq!(split_sandbox_path("/.bashrc"), ("/", ".bashrc")); + // No directory component at all + assert_eq!(split_sandbox_path("file.txt"), (".", "file.txt")); + } } From e83784900939aeeb679dfd30395c1487c8905087 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 2 Apr 2026 09:50:10 -0700 Subject: [PATCH 36/45] feat(bootstrap): resume gateway from existing state and persist SSH handshake secret (#488) --- .github/workflows/e2e-test.yml | 42 ++- crates/openshell-bootstrap/src/constants.rs | 2 + crates/openshell-bootstrap/src/docker.rs | 156 +++++++- crates/openshell-bootstrap/src/lib.rs | 160 +++++++-- crates/openshell-bootstrap/src/metadata.rs | 83 +++++ crates/openshell-bootstrap/src/runtime.rs | 140 ++++++-- crates/openshell-cli/src/bootstrap.rs | 105 +++--- crates/openshell-cli/src/run.rs | 82 +++-- crates/openshell-cli/src/ssh.rs | 57 ++- crates/openshell-sandbox/src/secrets.rs | 2 +- deploy/docker/cluster-entrypoint.sh | 11 +- deploy/docker/cluster-healthcheck.sh | 12 + .../helm/openshell/templates/statefulset.yaml | 5 +- deploy/helm/openshell/values.yaml | 8 +- .../kube/manifests/openshell-helmchart.yaml | 1 - e2e/rust/tests/gateway_resume.rs | 337 ++++++++++++++++++ rust-toolchain.toml | 5 + tasks/scripts/cluster-deploy-fast.sh | 14 +- tasks/test.toml | 3 +- 19 files changed, 1030 insertions(+), 195 deletions(-) create mode 100644 e2e/rust/tests/gateway_resume.rs create mode 
100644 rust-toolchain.toml diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index f14ccb880..a89f4508f 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -19,9 +19,25 @@ permissions: jobs: e2e: - name: E2E + name: "E2E (${{ matrix.suite }})" runs-on: ${{ inputs.runner }} timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + include: + - suite: python + cluster: e2e-python + port: "8080" + cmd: "mise run --no-prepare --skip-deps e2e:python" + - suite: rust + cluster: e2e-rust + port: "8081" + cmd: "mise run --no-prepare --skip-deps e2e:rust" + - suite: gateway-resume + cluster: e2e-resume + port: "8082" + cmd: "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test gateway_resume" container: image: ghcr.io/nvidia/openshell/ci:latest credentials: @@ -38,6 +54,7 @@ jobs: OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} + OPENSHELL_GATEWAY: ${{ matrix.cluster }} steps: - uses: actions/checkout@v4 @@ -48,21 +65,26 @@ jobs: run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} - name: Install Python dependencies and generate protobuf stubs + if: matrix.suite == 'python' run: uv sync --frozen && mise run --no-prepare python:proto - - name: Bootstrap and deploy cluster + - name: Build Rust CLI + if: matrix.suite != 'python' + run: cargo build -p openshell-cli --features openshell-core/dev-settings + + - name: Install SSH client + if: matrix.suite != 'python' + run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/* + + - name: Bootstrap cluster env: GATEWAY_HOST: host.docker.internal - GATEWAY_PORT: "8080" + GATEWAY_PORT: ${{ matrix.port }} + CLUSTER_NAME: ${{ matrix.cluster }} SKIP_IMAGE_PUSH: "1" SKIP_CLUSTER_IMAGE_BUILD: "1" OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} run: 
mise run --no-prepare --skip-deps cluster - - name: Install SSH client for Rust CLI e2e tests - run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/* - - - name: Run E2E tests - run: | - mise run --no-prepare --skip-deps e2e:python - mise run --no-prepare --skip-deps e2e:rust + - name: Run tests + run: ${{ matrix.cmd }} diff --git a/crates/openshell-bootstrap/src/constants.rs b/crates/openshell-bootstrap/src/constants.rs index ff283b3ea..74e381fd2 100644 --- a/crates/openshell-bootstrap/src/constants.rs +++ b/crates/openshell-bootstrap/src/constants.rs @@ -11,6 +11,8 @@ pub const SERVER_TLS_SECRET_NAME: &str = "openshell-server-tls"; pub const SERVER_CLIENT_CA_SECRET_NAME: &str = "openshell-server-client-ca"; /// K8s secret holding the client TLS certificate, key, and CA cert (shared by CLI and sandboxes). pub const CLIENT_TLS_SECRET_NAME: &str = "openshell-client-tls"; +/// K8s secret holding the SSH handshake HMAC secret (shared by gateway and sandbox pods). 
+pub const SSH_HANDSHAKE_SECRET_NAME: &str = "openshell-ssh-handshake";
 
 pub fn container_name(name: &str) -> String {
     format!("openshell-cluster-{name}")
diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs
index cc63aacce..d9aaed7f4 100644
--- a/crates/openshell-bootstrap/src/docker.rs
+++ b/crates/openshell-bootstrap/src/docker.rs
@@ -8,8 +8,9 @@ use bollard::API_DEFAULT_VERSION;
 use bollard::Docker;
 use bollard::errors::Error as BollardError;
 use bollard::models::{
-    ContainerCreateBody, DeviceRequest, HostConfig, HostConfigCgroupnsModeEnum,
-    NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, VolumeCreateRequest,
+    ContainerCreateBody, DeviceRequest, EndpointSettings, HostConfig, HostConfigCgroupnsModeEnum,
+    NetworkConnectRequest, NetworkCreateRequest, NetworkDisconnectRequest, PortBinding,
+    RestartPolicy, RestartPolicyNameEnum, VolumeCreateRequest,
 };
 use bollard::query_parameters::{
     CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions,
@@ -466,6 +467,9 @@ pub async fn ensure_image(
     Ok(())
 }
 
+/// Returns the actual host port the container is using. When an existing
+/// container is reused (same image), this may differ from `gateway_port`
+/// because the container was originally created with a different port.
 pub async fn ensure_container(
     docker: &Docker,
     name: &str,
@@ -478,7 +482,7 @@ pub async fn ensure_container(
     registry_username: Option<&str>,
     registry_token: Option<&str>,
     device_ids: &[String],
-) -> Result<()> {
+) -> Result<u16> {
     let container_name = container_name(name);
 
     // Check if the container already exists
@@ -505,10 +509,37 @@ pub async fn ensure_container(
     };
 
     if image_matches {
-        return Ok(());
+        // The container exists with the correct image, but its network
+        // attachment may be stale. When the gateway is resumed after a
+        // container kill, `ensure_network` destroys and recreates the
+        // Docker network (giving it a new ID). The stopped container
+        // still references the old network ID, so `docker start` would
+        // fail with "network not found".
+        //
+        // Fix: disconnect from any existing networks and reconnect to
+        // the current (just-created) network before returning.
+        let expected_net = network_name(name);
+        reconcile_container_network(docker, &container_name, &expected_net).await?;
+
+        // Read the actual host port from the container's port bindings
+        // as a cross-check. The caller should already pass the correct
+        // port (from stored metadata), but this catches mismatches if
+        // the container was recreated with a different port externally.
+        let actual_port = info
+            .host_config
+            .as_ref()
+            .and_then(|hc| hc.port_bindings.as_ref())
+            .and_then(|pb| pb.get("30051/tcp"))
+            .and_then(|bindings| bindings.as_ref())
+            .and_then(|bindings| bindings.first())
+            .and_then(|b| b.host_port.as_ref())
+            .and_then(|p| p.parse::<u16>().ok())
+            .unwrap_or(gateway_port);
+
+        return Ok(actual_port);
     }
 
-    // Image changed — remove the stale container so we can recreate it
+    // Image changed — remove the stale container so we can recreate it.
     tracing::info!(
         "Container {} exists but uses a different image (container={}, desired={}), recreating",
         container_name,
@@ -555,6 +586,12 @@ pub async fn ensure_container(
         port_bindings: Some(port_bindings),
         binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]),
         network_mode: Some(network_name(name)),
+        // Automatically restart the container when Docker restarts, unless the
+        // user explicitly stopped it with `gateway stop`.
+        restart_policy: Some(RestartPolicy {
+            name: Some(RestartPolicyNameEnum::UNLESS_STOPPED),
+            maximum_retry_count: None,
+        }),
         // Add host gateway aliases for DNS resolution.
         // This allows both the entrypoint script and the running gateway
         // process to reach services on the Docker host.
@@ -734,7 +771,7 @@ pub async fn ensure_container( .await .into_diagnostic() .wrap_err("failed to create gateway container")?; - Ok(()) + Ok(gateway_port) } /// Information about a container that is holding a port we need. @@ -956,6 +993,48 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<() Ok(()) } +/// Clean up the gateway container and network, preserving the persistent volume. +/// +/// Used when a resume attempt fails — we want to remove the container we may +/// have just created but keep the volume so the user can retry without losing +/// their k3s/etcd state and sandbox data. +pub async fn cleanup_gateway_container(docker: &Docker, name: &str) -> Result<()> { + let container_name = container_name(name); + let net_name = network_name(name); + + // Disconnect container from network + let _ = docker + .disconnect_network( + &net_name, + NetworkDisconnectRequest { + container: container_name.clone(), + force: Some(true), + }, + ) + .await; + + let _ = stop_container(docker, &container_name).await; + + let remove_container = docker + .remove_container( + &container_name, + Some(RemoveContainerOptions { + force: true, + ..Default::default() + }), + ) + .await; + if let Err(err) = remove_container + && !is_not_found(&err) + { + return Err(err).into_diagnostic(); + } + + force_remove_network(docker, &net_name).await?; + + Ok(()) +} + /// Forcefully remove a Docker network, disconnecting any remaining /// containers first. This ensures that stale Docker network endpoints /// cannot prevent port bindings from being released. @@ -993,6 +1072,71 @@ async fn force_remove_network(docker: &Docker, net_name: &str) -> Result<()> { } } +/// Ensure a stopped container is connected to the expected Docker network. +/// +/// When a gateway is resumed after the container was killed (but not removed), +/// `ensure_network` destroys and recreates the network with a new ID. 
The
+/// stopped container still holds a reference to the old network ID in its
+/// config, so `docker start` would fail with a 404 "network not found" error.
+///
+/// This function disconnects the container from any networks that no longer
+/// match the expected network name and connects it to the correct one.
+async fn reconcile_container_network(
+    docker: &Docker,
+    container_name: &str,
+    expected_network: &str,
+) -> Result<()> {
+    let info = docker
+        .inspect_container(container_name, None::<InspectContainerOptions>)
+        .await
+        .into_diagnostic()
+        .wrap_err("failed to inspect container for network reconciliation")?;
+
+    // Check the container's current network attachments via NetworkSettings.
+    let attached_networks: Vec<String> = info
+        .network_settings
+        .as_ref()
+        .and_then(|ns| ns.networks.as_ref())
+        .map(|nets| nets.keys().cloned().collect())
+        .unwrap_or_default();
+
+    // If the container is already attached to the expected network (by name),
+    // Docker will resolve the name to the current network ID on start.
+    // However, when the network was destroyed and recreated, the container's
+    // stored endpoint references the old ID. Disconnect and reconnect to
+    // pick up the new network ID.
+    for net_name in &attached_networks {
+        let _ = docker
+            .disconnect_network(
+                net_name,
+                NetworkDisconnectRequest {
+                    container: container_name.to_string(),
+                    force: Some(true),
+                },
+            )
+            .await;
+    }
+
+    // Connect to the (freshly created) expected network.
+ docker + .connect_network( + expected_network, + NetworkConnectRequest { + container: container_name.to_string(), + endpoint_config: Some(EndpointSettings::default()), + }, + ) + .await + .into_diagnostic() + .wrap_err("failed to connect container to gateway network")?; + + tracing::debug!( + "Reconciled network for container {container_name}: disconnected from {attached_networks:?}, connected to {expected_network}" + ); + + Ok(()) +} + fn is_not_found(err: &BollardError) -> bool { matches!( err, diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 7dcabe052..b569cabe7 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -26,13 +26,13 @@ use miette::{IntoDiagnostic, Result}; use std::sync::{Arc, Mutex}; use crate::constants::{ - CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME, network_name, - volume_name, + CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME, + SSH_HANDSHAKE_SECRET_NAME, network_name, volume_name, }; use crate::docker::{ - check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container, - ensure_image, ensure_network, ensure_volume, resolve_gpu_device_ids, start_container, - stop_container, + check_existing_gateway, check_port_conflicts, cleanup_gateway_container, + destroy_gateway_resources, ensure_container, ensure_image, ensure_network, ensure_volume, + resolve_gpu_device_ids, start_container, stop_container, }; use crate::metadata::{ create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host, @@ -308,19 +308,22 @@ where .and_then(|info| info.cdi_spec_dirs) .is_some_and(|dirs| !dirs.is_empty()); - // If an existing gateway is found, either tear it down (when recreate is - // requested) or bail out so the caller can prompt the user / reuse it. 
+ // If an existing gateway is found, decide how to proceed: + // - recreate: destroy everything and start fresh + // - otherwise: auto-resume from existing state (the ensure_* calls are + // idempotent and will reuse the volume, create a container if needed, + // and start it) + let mut resume = false; if let Some(existing) = check_existing_gateway(&target_docker, &name).await? { if recreate { log("[status] Removing existing gateway".to_string()); destroy_gateway_resources(&target_docker, &name).await?; + } else if existing.container_running { + log("[status] Gateway is already running".to_string()); + resume = true; } else { - return Err(miette::miette!( - "Gateway '{name}' already exists (container_running={}).\n\ - Use --recreate to destroy and redeploy, or destroy it first with:\n\n \ - openshell gateway destroy --name {name}", - existing.container_running, - )); + log("[status] Resuming gateway from existing state".to_string()); + resume = true; } } @@ -426,7 +429,10 @@ where // See: https://github.com/NVIDIA/OpenShell/issues/463 let deploy_result: Result = async { let device_ids = resolve_gpu_device_ids(&gpu, cdi_supported); - ensure_container( + // ensure_container returns the actual host port — which may differ from + // the requested `port` when reusing an existing container that was + // originally created with a different port. + let actual_port = ensure_container( &target_docker, &name, &image_ref, @@ -440,16 +446,22 @@ where &device_ids, ) .await?; + let port = actual_port; start_container(&target_docker, &name).await?; // Clean up stale k3s nodes left over from previous container instances that - // used the same persistent volume. Without this, pods remain scheduled on + // used the same persistent volume. Without this, pods remain scheduled on // NotReady ghost nodes and the health check will time out. + // + // The function retries internally until kubectl becomes available (k3s may + // still be initialising after the container start). 
It also force-deletes + // pods stuck in Terminating on the removed nodes so that StatefulSets can + // reschedule replacements immediately. match clean_stale_nodes(&target_docker, &name).await { Ok(0) => {} - Ok(n) => tracing::debug!("removed {n} stale node(s)"), + Ok(n) => tracing::info!("removed {n} stale node(s) and their orphaned pods"), Err(err) => { - tracing::debug!("stale node cleanup failed (non-fatal): {err}"); + tracing::warn!("stale node cleanup failed (non-fatal): {err}"); } } @@ -476,6 +488,11 @@ where store_pki_bundle(&name, &pki_bundle)?; + // Reconcile SSH handshake secret: reuse existing K8s secret if present, + // generate and persist a new one otherwise. This secret is stored in etcd + // (on the persistent volume) so it survives container restarts. + reconcile_ssh_handshake_secret(&target_docker, &name, &log).await?; + // Push locally-built component images into the k3s containerd runtime. // This is the "push" path for local development — images are exported from // the local Docker daemon and streamed into the cluster's containerd so @@ -545,15 +562,30 @@ where docker: target_docker, }), Err(deploy_err) => { - // Automatically clean up Docker resources (volume, container, network, - // image) so the environment is left in a retryable state. - tracing::info!("deploy failed, cleaning up gateway resources for '{name}'"); - if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await { - tracing::warn!( - "automatic cleanup after failed deploy also failed: {cleanup_err}. \ - Manual cleanup may be required: \ - openshell gateway destroy --name {name}" + if resume { + // When resuming, preserve the volume so the user can retry. + // Only clean up the container and network that we may have created. 
+ tracing::info!( + "resume failed, cleaning up container for '{name}' (preserving volume)" ); + if let Err(cleanup_err) = cleanup_gateway_container(&target_docker, &name).await { + tracing::warn!( + "automatic cleanup after failed resume also failed: {cleanup_err}. \ + Manual cleanup may be required: \ + openshell gateway destroy --name {name}" + ); + } + } else { + // Automatically clean up Docker resources (volume, container, network, + // image) so the environment is left in a retryable state. + tracing::info!("deploy failed, cleaning up gateway resources for '{name}'"); + if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await { + tracing::warn!( + "automatic cleanup after failed deploy also failed: {cleanup_err}. \ + Manual cleanup may be required: \ + openshell gateway destroy --name {name}" + ); + } } Err(deploy_err) } @@ -830,6 +862,14 @@ where let cname = container_name(name); let kubeconfig = constants::KUBECONFIG_PATH; + // Wait for the k3s API server and openshell namespace before attempting + // to read secrets. Without this, kubectl fails transiently on resume + // (k3s hasn't booted yet), the code assumes secrets are gone, and + // regenerates PKI unnecessarily — triggering a server rollout restart + // and TLS errors for in-flight connections. + log("[progress] Waiting for openshell namespace".to_string()); + wait_for_namespace(docker, &cname, kubeconfig, "openshell").await?; + // Try to load existing secrets. match load_existing_pki_bundle(docker, &cname, kubeconfig).await { Ok(bundle) => { @@ -844,10 +884,6 @@ where } // Generate fresh PKI and apply to cluster. - // Namespace may still be creating on first bootstrap, so wait here only - // when rotation is actually needed. 
- log("[progress] Waiting for openshell namespace".to_string()); - wait_for_namespace(docker, &cname, kubeconfig, "openshell").await?; log("[progress] Generating TLS certificates".to_string()); let bundle = generate_pki(extra_sans)?; log("[progress] Applying TLS secrets to gateway".to_string()); @@ -858,6 +894,72 @@ where Ok((bundle, true)) } +/// Reconcile the SSH handshake HMAC secret as a Kubernetes Secret. +/// +/// If the secret already exists in the cluster, this is a no-op. Otherwise a +/// fresh 32-byte hex secret is generated and applied. Because the secret lives +/// in etcd (backed by the persistent Docker volume), it survives container +/// restarts without regeneration — existing sandbox SSH sessions remain valid. +async fn reconcile_ssh_handshake_secret(docker: &Docker, name: &str, log: &F) -> Result<()> +where + F: Fn(String) + Sync, +{ + use miette::WrapErr; + + let cname = container_name(name); + let kubeconfig = constants::KUBECONFIG_PATH; + + // Check if the secret already exists. + let (output, exit_code) = exec_capture_with_exit( + docker, + &cname, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "KUBECONFIG={kubeconfig} kubectl -n openshell get secret {SSH_HANDSHAKE_SECRET_NAME} -o jsonpath='{{.data.secret}}' 2>/dev/null" + ), + ], + ) + .await?; + + if exit_code == 0 && !output.trim().is_empty() { + tracing::debug!( + "existing SSH handshake secret found ({} bytes encoded)", + output.trim().len() + ); + log("[progress] Reusing existing SSH handshake secret".to_string()); + return Ok(()); + } + + // Generate a new 32-byte hex secret and create the K8s secret. 
+ log("[progress] Generating SSH handshake secret".to_string()); + let (output, exit_code) = exec_capture_with_exit( + docker, + &cname, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "SECRET=$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \\n') && \ + KUBECONFIG={kubeconfig} kubectl -n openshell create secret generic {SSH_HANDSHAKE_SECRET_NAME} \ + --from-literal=secret=$SECRET --dry-run=client -o yaml | \ + KUBECONFIG={kubeconfig} kubectl apply -f -" + ), + ], + ) + .await?; + + if exit_code != 0 { + return Err(miette::miette!( + "failed to create SSH handshake secret (exit {exit_code}): {output}" + )) + .wrap_err("failed to apply SSH handshake secret"); + } + + Ok(()) +} + /// Load existing TLS secrets from the cluster and reconstruct a [`PkiBundle`]. /// /// Returns an error string describing why secrets couldn't be loaded (for logging). diff --git a/crates/openshell-bootstrap/src/metadata.rs b/crates/openshell-bootstrap/src/metadata.rs index 15f79c089..20680f4c0 100644 --- a/crates/openshell-bootstrap/src/metadata.rs +++ b/crates/openshell-bootstrap/src/metadata.rs @@ -47,6 +47,34 @@ pub struct GatewayMetadata { pub edge_auth_url: Option, } +impl GatewayMetadata { + /// Extract the host portion from the stored `gateway_endpoint` URL. + /// + /// Returns `None` if the endpoint is malformed or uses a default loopback + /// address (`127.0.0.1`, `localhost`, `::1`) — those are never meaningful + /// as a `--gateway-host` override. 
+ pub fn gateway_host(&self) -> Option<&str> { + // Endpoint format: "https://host:port" or "http://host:port" + let after_scheme = self + .gateway_endpoint + .strip_prefix("https://") + .or_else(|| self.gateway_endpoint.strip_prefix("http://"))?; + // Strip port suffix (":8082") + let host = after_scheme + .rsplit_once(':') + .map_or(after_scheme, |(h, _)| h); + if host.is_empty() + || host == "127.0.0.1" + || host == "localhost" + || host == "::1" + || host == "[::1]" + { + return None; + } + Some(host) + } +} + pub fn create_gateway_metadata( name: &str, remote: Option<&RemoteOptions>, @@ -500,6 +528,61 @@ mod tests { assert_eq!(meta.gateway_endpoint, "http://host.docker.internal:8080"); } + // ── GatewayMetadata::gateway_host() ────────────────────────────── + + #[test] + fn gateway_host_returns_custom_host() { + let meta = + create_gateway_metadata_with_host("t", None, 8082, Some("host.docker.internal"), false); + assert_eq!(meta.gateway_host(), Some("host.docker.internal")); + } + + #[test] + fn gateway_host_returns_none_for_loopback() { + let meta = create_gateway_metadata("t", None, 8080); + // Default endpoint is https://127.0.0.1:8080 + assert_eq!(meta.gateway_host(), None); + } + + #[test] + fn gateway_host_returns_none_for_localhost() { + let meta = GatewayMetadata { + name: "t".into(), + gateway_endpoint: "https://localhost:8080".into(), + is_remote: false, + gateway_port: 8080, + remote_host: None, + resolved_host: None, + auth_mode: None, + edge_team_domain: None, + edge_auth_url: None, + }; + assert_eq!(meta.gateway_host(), None); + } + + #[test] + fn gateway_host_returns_ip_for_remote() { + let meta = GatewayMetadata { + name: "t".into(), + gateway_endpoint: "https://10.0.0.5:8080".into(), + is_remote: true, + gateway_port: 8080, + remote_host: Some("user@10.0.0.5".into()), + resolved_host: Some("10.0.0.5".into()), + auth_mode: None, + edge_team_domain: None, + edge_auth_url: None, + }; + assert_eq!(meta.gateway_host(), Some("10.0.0.5")); + } + + 
#[test] + fn gateway_host_handles_http_scheme() { + let meta = + create_gateway_metadata_with_host("t", None, 8080, Some("host.docker.internal"), true); + assert_eq!(meta.gateway_host(), Some("host.docker.internal")); + } + #[test] fn remote_gateway_metadata_with_tls_disabled() { let opts = RemoteOptions::new("user@10.0.0.5"); diff --git a/crates/openshell-bootstrap/src/runtime.rs b/crates/openshell-bootstrap/src/runtime.rs index 271fde8d4..2a10b2651 100644 --- a/crates/openshell-bootstrap/src/runtime.rs +++ b/crates/openshell-bootstrap/src/runtime.rs @@ -362,57 +362,135 @@ pub async fn fetch_recent_logs(docker: &Docker, container_name: &str, n: usize) rendered } -/// Remove stale k3s nodes from a cluster with a reused persistent volume. +/// Remove stale k3s nodes and their orphaned pods from a resumed cluster. /// /// When a cluster container is recreated but the volume is reused, k3s registers /// a new node (using the container ID as the hostname) while old node entries /// persist in etcd. Pods scheduled on those stale `NotReady` nodes will never run, /// causing health checks to fail. /// -/// This function identifies all `NotReady` nodes and deletes them so k3s can -/// reschedule workloads onto the current (Ready) node. +/// This function retries with backoff until `kubectl` becomes available (k3s may +/// still be initialising), then: +/// 1. Deletes all `NotReady` nodes so k3s stops tracking them. +/// 2. Force-deletes any pods stuck in `Terminating` so `StatefulSets` and +/// Deployments can reschedule replacements on the current (Ready) node. /// /// Returns the number of stale nodes removed. pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result { + // Retry until kubectl is responsive. k3s can take 10-20 s to start the + // API server after a container restart, so we allow up to ~45 s. 
+ const MAX_ATTEMPTS: u32 = 15; + const RETRY_DELAY: Duration = Duration::from_secs(3); + let container_name = container_name(name); + let mut stale_nodes: Vec = Vec::new(); + + for attempt in 1..=MAX_ATTEMPTS { + // List ALL node names and the container's own hostname. Any node that + // is not the current container is stale — we cannot rely on the Ready + // condition because k3s may not have marked the old node NotReady yet + // when this runs shortly after container start. + let (output, exit_code) = exec_capture_with_exit( + docker, + &container_name, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \ + --no-headers -o custom-columns=NAME:.metadata.name \ + 2>/dev/null" + ), + ], + ) + .await?; + + if exit_code == 0 { + // Determine the current node name (container hostname). + let (hostname_out, _) = + exec_capture_with_exit(docker, &container_name, vec!["hostname".to_string()]) + .await?; + let current_hostname = hostname_out.trim().to_string(); + + stale_nodes = output + .lines() + .map(str::trim) + .filter(|l| !l.is_empty() && *l != current_hostname) + .map(ToString::to_string) + .collect(); + break; + } + + if attempt < MAX_ATTEMPTS { + tracing::debug!( + "kubectl not ready yet (attempt {attempt}/{MAX_ATTEMPTS}), retrying in {}s", + RETRY_DELAY.as_secs() + ); + tokio::time::sleep(RETRY_DELAY).await; + } + } + + if stale_nodes.is_empty() { + return Ok(0); + } + + let node_list = stale_nodes.join(" "); + let count = stale_nodes.len(); + tracing::info!("removing {} stale node(s): {}", count, node_list); - // Get the list of NotReady nodes. - // The last condition on a node is always type=Ready; we need to check its - // **status** (True/False/Unknown), not its type. Nodes where the Ready - // condition status is not "True" are stale and should be removed. - let (output, exit_code) = exec_capture_with_exit( + // Step 1: delete the stale node objects. 
+ let (_output, exit_code) = exec_capture_with_exit( docker, &container_name, vec![ "sh".to_string(), "-c".to_string(), format!( - "KUBECONFIG={KUBECONFIG_PATH} kubectl get nodes \ - --no-headers -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].status \ - 2>/dev/null | grep -v '\\bTrue$' | awk '{{print $1}}'" + "KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found" ), ], ) .await?; if exit_code != 0 { - // kubectl not ready yet or no nodes — nothing to clean - return Ok(0); + tracing::warn!("failed to delete stale nodes (exit code {exit_code})"); } - let stale_nodes: Vec<&str> = output - .lines() - .map(str::trim) - .filter(|l| !l.is_empty()) - .collect(); - if stale_nodes.is_empty() { - return Ok(0); - } + // Step 2: force-delete pods stuck in Terminating. After the stale node is + // removed, pods that were scheduled on it transition to Terminating but + // will never complete graceful shutdown (the node is gone). StatefulSets + // will not create a replacement until the old pod is fully deleted. 
+ let (_output, exit_code) = exec_capture_with_exit( + docker, + &container_name, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \ + --field-selector=status.phase=Running -o name 2>/dev/null; \ + for pod_line in $(KUBECONFIG={KUBECONFIG_PATH} kubectl get pods --all-namespaces \ + --no-headers 2>/dev/null | awk '$4 == \"Terminating\" {{print $1\"/\"$2}}'); do \ + ns=${{pod_line%%/*}}; pod=${{pod_line#*/}}; \ + KUBECONFIG={KUBECONFIG_PATH} kubectl delete pod \"$pod\" -n \"$ns\" \ + --force --grace-period=0 --ignore-not-found 2>/dev/null; \ + done" + ), + ], + ) + .await?; - let node_list = stale_nodes.join(" "); - let count = stale_nodes.len(); - tracing::info!("removing {} stale node(s): {}", count, node_list); + if exit_code != 0 { + tracing::debug!( + "force-delete of terminating pods returned exit code {exit_code} (non-fatal)" + ); + } + // Step 3: delete PersistentVolumeClaims in the openshell namespace whose + // backing PV has node affinity for a stale node. local-path-provisioner + // creates PVs tied to the original node; when the node changes, the PV is + // unschedulable and the `StatefulSet` pod stays Pending. Deleting the PVC + // (and its PV) lets the provisioner create a fresh one on the current node. 
let (_output, exit_code) = exec_capture_with_exit( docker, &container_name, @@ -420,14 +498,24 @@ pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result { "sh".to_string(), "-c".to_string(), format!( - "KUBECONFIG={KUBECONFIG_PATH} kubectl delete node {node_list} --ignore-not-found" + r#"KUBECONFIG={KUBECONFIG_PATH}; export KUBECONFIG; \ + CURRENT_NODE=$(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name 2>/dev/null | head -1); \ + [ -z "$CURRENT_NODE" ] && exit 0; \ + for pv in $(kubectl get pv -o jsonpath='{{.items[*].metadata.name}}' 2>/dev/null); do \ + NODE=$(kubectl get pv "$pv" -o jsonpath='{{.spec.nodeAffinity.required.nodeSelectorTerms[0].matchExpressions[0].values[0]}}' 2>/dev/null); \ + [ "$NODE" = "$CURRENT_NODE" ] && continue; \ + NS=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.namespace}}' 2>/dev/null); \ + PVC=$(kubectl get pv "$pv" -o jsonpath='{{.spec.claimRef.name}}' 2>/dev/null); \ + [ -n "$PVC" ] && kubectl delete pvc "$PVC" -n "$NS" --ignore-not-found 2>/dev/null; \ + kubectl delete pv "$pv" --ignore-not-found 2>/dev/null; \ + done"# ), ], ) .await?; if exit_code != 0 { - tracing::warn!("failed to delete stale nodes (exit code {exit_code})"); + tracing::debug!("PV/PVC cleanup returned exit code {exit_code} (non-fatal)"); } Ok(count) diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index ea6410b91..ee9a481aa 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -144,47 +144,62 @@ pub async fn run_bootstrap( ); eprintln!(); - // Auto-bootstrap always recreates if stale Docker resources are found - // (e.g. metadata was deleted but container/volume still exist). 
- let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name).with_recreate(true); - if let Some(dest) = remote { - let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest); - if let Some(key) = ssh_key { - remote_opts = remote_opts.with_ssh_key(key); + // Build deploy options. The deploy flow auto-resumes from existing state + // (preserving sandboxes and secrets) when it finds an existing gateway. + // If the initial attempt fails, fall back to a full recreate. + let build_options = |recreate: bool| { + let mut opts = openshell_bootstrap::DeployOptions::new(&gateway_name) + .with_recreate(recreate) + .with_gpu(if gpu { + vec!["auto".to_string()] + } else { + vec![] + }); + if let Some(dest) = remote { + let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest); + if let Some(key) = ssh_key { + remote_opts = remote_opts.with_ssh_key(key); + } + opts = opts.with_remote(remote_opts); } - options = options.with_remote(remote_opts); - } - // Read registry credentials from environment for the auto-bootstrap path. - // The explicit `--registry-username` / `--registry-token` flags are only - // on `gateway start`; when bootstrapping via `sandbox create`, the env - // vars are the mechanism. - if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME") - && !username.trim().is_empty() - { - options = options.with_registry_username(username); - } - if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN") - && !token.trim().is_empty() - { - options = options.with_registry_token(token); - } - // Read gateway host override from environment. Needed whenever the - // client cannot reach the Docker host at 127.0.0.1 — CI containers, - // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag - // is only on `gateway start`; this env var covers the auto-bootstrap - // path triggered by `sandbox create`. 
- if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST") - && !host.trim().is_empty() - { - options = options.with_gateway_host(host); - } - options = options.with_gpu(if gpu { - vec!["auto".to_string()] - } else { - vec![] - }); + // Read registry credentials from environment for the auto-bootstrap path. + // The explicit `--registry-username` / `--registry-token` flags are only + // on `gateway start`; when bootstrapping via `sandbox create`, the env + // vars are the mechanism. + if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME") + && !username.trim().is_empty() + { + opts = opts.with_registry_username(username); + } + if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN") + && !token.trim().is_empty() + { + opts = opts.with_registry_token(token); + } + // Read gateway host override from environment. Needed whenever the + // client cannot reach the Docker host at 127.0.0.1 — CI containers, + // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag + // is only on `gateway start`; this env var covers the auto-bootstrap + // path triggered by `sandbox create`. + if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST") + && !host.trim().is_empty() + { + opts = opts.with_gateway_host(host); + } + opts + }; - let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?; + // Deploy the gateway. The deploy flow auto-resumes from existing state + // when it finds one. If that fails, fall back to a full recreate. + let handle = match deploy_gateway_with_panel(build_options(false), &gateway_name, location) + .await + { + Ok(handle) => handle, + Err(resume_err) => { + tracing::warn!("auto-bootstrap resume failed, falling back to recreate: {resume_err}"); + deploy_gateway_with_panel(build_options(true), &gateway_name, location).await? 
+ } + }; let server = handle.gateway_endpoint().to_string(); print_deploy_summary(&gateway_name, &handle); @@ -210,9 +225,13 @@ pub async fn run_bootstrap( /// Retry connecting to the gateway gRPC endpoint until it succeeds or a /// timeout is reached. Uses exponential backoff starting at 500 ms, doubling -/// up to 4 s, with a total deadline of 30 s. -async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> { - const MAX_WAIT: Duration = Duration::from_secs(30); +/// up to 4 s, with a total deadline of 90 s. +/// +/// The generous timeout accounts for gateway resume scenarios where stale k3s +/// nodes must be cleaned up and workload pods rescheduled before the gRPC +/// endpoint becomes available. +pub(crate) async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> { + const MAX_WAIT: Duration = Duration::from_secs(90); const INITIAL_BACKOFF: Duration = Duration::from_millis(500); let start = std::time::Instant::now(); @@ -236,7 +255,7 @@ async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> { Err(last_err .unwrap_or_else(|| miette::miette!("timed out waiting for gateway")) - .wrap_err("gateway deployed but not accepting connections after 30 s")) + .wrap_err("gateway deployed but not accepting connections after 90 s")) } #[cfg(test)] diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index b3ec05383..2c06ab948 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1369,62 +1369,51 @@ pub async fn gateway_admin_deploy( opts }); - // Check whether a gateway already exists. If so, prompt the user (unless - // --recreate was passed or we're in non-interactive mode). - let mut should_recreate = recreate; - if let Some(existing) = - openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? 
- { - if !should_recreate { - let interactive = std::io::stdin().is_terminal() && std::io::stderr().is_terminal(); - if interactive { - let status = if existing.container_running { - "running" - } else if existing.container_exists { - "stopped" - } else { - "volume only" - }; - eprintln!(); + // If the gateway is already running and we're not recreating, short-circuit. + if !recreate { + if let Some(existing) = + openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? + { + if existing.container_running { eprintln!( - "{} Gateway '{name}' already exists ({status}).", - "!".yellow().bold() + "{} Gateway '{name}' is already running.", + "✓".green().bold() ); - if let Some(image) = &existing.container_image { - eprintln!(" {} {}", "Image:".dimmed(), image); - } - eprintln!(); - eprint!("Destroy and recreate? [y/N] "); - std::io::stderr().flush().ok(); - let mut input = String::new(); - std::io::stdin() - .read_line(&mut input) - .into_diagnostic() - .wrap_err("failed to read user input")?; - let choice = input.trim().to_lowercase(); - should_recreate = choice == "y" || choice == "yes"; - if !should_recreate { - eprintln!("Keeping existing gateway."); - return Ok(()); - } - } else { - // Non-interactive mode: reuse existing gateway silently. - eprintln!("Gateway '{name}' already exists, reusing."); return Ok(()); } } } + // When resuming an existing gateway (not recreating), prefer the port + // and gateway host from stored metadata over the CLI defaults. The user + // may have originally bootstrapped on a non-default port (e.g. `--port + // 8082`) or with `--gateway-host host.docker.internal`, and a bare + // `gateway start` without those flags should honour the original values. 
+ let stored_metadata = if !recreate { + openshell_bootstrap::load_gateway_metadata(name).ok() + } else { + None + }; + let effective_port = stored_metadata + .as_ref() + .filter(|m| m.gateway_port > 0) + .map_or(port, |m| m.gateway_port); + let effective_gateway_host: Option = gateway_host.map(String::from).or_else(|| { + stored_metadata + .as_ref() + .and_then(|m| m.gateway_host().map(String::from)) + }); + let mut options = DeployOptions::new(name) - .with_port(port) + .with_port(effective_port) .with_disable_tls(disable_tls) .with_disable_gateway_auth(disable_gateway_auth) .with_gpu(gpu) - .with_recreate(should_recreate); + .with_recreate(recreate); if let Some(opts) = remote_opts { options = options.with_remote(opts); } - if let Some(host) = gateway_host { + if let Some(host) = effective_gateway_host { options = options.with_gateway_host(host); } if let Some(username) = registry_username { @@ -1436,6 +1425,15 @@ pub async fn gateway_admin_deploy( let handle = deploy_gateway_with_panel(options, name, location).await?; + // Wait for the gRPC endpoint to actually accept connections before + // declaring the gateway ready. The Docker health check may pass before + // the gRPC listener inside the pod is fully bound. + let server = handle.gateway_endpoint().to_string(); + let tls = TlsOptions::default() + .with_gateway_name(name) + .with_default_paths(&server); + crate::bootstrap::wait_for_grpc_ready(&server, &tls).await?; + print_deploy_summary(name, &handle); // Auto-activate: set this gateway as the active gateway. 
diff --git a/crates/openshell-cli/src/ssh.rs b/crates/openshell-cli/src/ssh.rs index 79fb64fb7..ebcbbeb4f 100644 --- a/crates/openshell-cli/src/ssh.rs +++ b/crates/openshell-cli/src/ssh.rs @@ -772,27 +772,50 @@ pub async fn sandbox_ssh_proxy( .ok_or_else(|| miette::miette!("gateway URL missing port"))?; let connect_path = url.path(); - let mut stream: Box = - connect_gateway(scheme, gateway_host, gateway_port, tls).await?; - let request = format!( "CONNECT {connect_path} HTTP/1.1\r\nHost: {gateway_host}\r\nX-Sandbox-Id: {sandbox_id}\r\nX-Sandbox-Token: {token}\r\n\r\n" ); - stream - .write_all(request.as_bytes()) - .await - .into_diagnostic()?; - // Wrap in a BufReader **before** reading the HTTP response. The gateway - // may send the 200 OK response and the first SSH protocol bytes in the - // same TCP segment / WebSocket frame. A plain `read()` would consume - // those SSH bytes into our buffer and discard them, causing SSH to see a - // truncated protocol banner and exit with code 255. BufReader ensures - // any bytes read past the `\r\n\r\n` header boundary stay buffered and - // are returned by subsequent reads during the bidirectional copy phase. - let mut buf_stream = BufReader::new(stream); - let status = read_connect_status(&mut buf_stream).await?; - if status != 200 { + // The gateway returns 412 (Precondition Failed) when the sandbox pod + // exists but hasn't reached Ready phase yet. This is a transient state + // after sandbox allocation — retry with backoff instead of failing + // immediately. + const MAX_CONNECT_WAIT: Duration = Duration::from_secs(60); + const INITIAL_BACKOFF: Duration = Duration::from_secs(1); + + let start = std::time::Instant::now(); + let mut backoff = INITIAL_BACKOFF; + let mut buf_stream; + + loop { + let mut stream: Box = + connect_gateway(scheme, gateway_host, gateway_port, tls).await?; + stream + .write_all(request.as_bytes()) + .await + .into_diagnostic()?; + + // Wrap in a BufReader **before** reading the HTTP response. 
The gateway + // may send the 200 OK response and the first SSH protocol bytes in the + // same TCP segment / WebSocket frame. A plain `read()` would consume + // those SSH bytes into our buffer and discard them, causing SSH to see a + // truncated protocol banner and exit with code 255. BufReader ensures + // any bytes read past the `\r\n\r\n` header boundary stay buffered and + // are returned by subsequent reads during the bidirectional copy phase. + buf_stream = BufReader::new(stream); + let status = read_connect_status(&mut buf_stream).await?; + if status == 200 { + break; + } + if status == 412 && start.elapsed() < MAX_CONNECT_WAIT { + tracing::debug!( + elapsed = ?start.elapsed(), + "sandbox not yet ready (HTTP 412), retrying in {backoff:?}" + ); + tokio::time::sleep(backoff).await; + backoff = (backoff * 2).min(Duration::from_secs(8)); + continue; + } return Err(miette::miette!( "gateway CONNECT failed with status {status}" )); diff --git a/crates/openshell-sandbox/src/secrets.rs b/crates/openshell-sandbox/src/secrets.rs index 88b84831e..a27537c91 100644 --- a/crates/openshell-sandbox/src/secrets.rs +++ b/crates/openshell-sandbox/src/secrets.rs @@ -62,7 +62,7 @@ pub(crate) struct RewriteTargetResult { // --------------------------------------------------------------------------- #[derive(Debug, Clone, Default)] -pub(crate) struct SecretResolver { +pub struct SecretResolver { by_placeholder: HashMap, } diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index d4717d88e..367665db3 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -457,10 +457,10 @@ if [ -n "${IMAGE_PULL_POLICY:-}" ] && [ -f "$HELMCHART" ]; then sed -i "s|pullPolicy: Always|pullPolicy: ${IMAGE_PULL_POLICY}|" "$HELMCHART" fi -# Generate a random SSH handshake secret for the NSSH1 HMAC handshake between -# the gateway and sandbox SSH servers. This is required — the server will refuse -# to start without it. 
-SSH_HANDSHAKE_SECRET="${SSH_HANDSHAKE_SECRET:-$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \n')}" +# SSH handshake secret: previously generated here and injected via sed into the +# HelmChart CR. Now persisted as a Kubernetes Secret (openshell-ssh-handshake) +# created by the bootstrap process after k3s starts. This ensures the secret +# survives container restarts without regeneration. # Inject SSH gateway host/port into the HelmChart manifest so the openshell # server returns the correct address to CLI clients for SSH proxy CONNECT. @@ -479,9 +479,6 @@ if [ -f "$HELMCHART" ]; then # Clear the placeholder so the default (8080) is used sed -i "s|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g" "$HELMCHART" fi - echo "Setting SSH handshake secret" - sed -i "s|__SSH_HANDSHAKE_SECRET__|${SSH_HANDSHAKE_SECRET}|g" "$HELMCHART" - # Disable gateway auth: when set, the server accepts connections without # client certificates (for reverse-proxy / Cloudflare Tunnel deployments). if [ "${DISABLE_GATEWAY_AUTH:-}" = "true" ]; then diff --git a/deploy/docker/cluster-healthcheck.sh b/deploy/docker/cluster-healthcheck.sh index 68210b456..e2828c6e5 100644 --- a/deploy/docker/cluster-healthcheck.sh +++ b/deploy/docker/cluster-healthcheck.sh @@ -68,3 +68,15 @@ if [ "${DISABLE_TLS:-}" != "true" ]; then kubectl -n openshell get secret openshell-server-tls >/dev/null 2>&1 || exit 1 kubectl -n openshell get secret openshell-client-tls >/dev/null 2>&1 || exit 1 fi + +# Verify SSH handshake secret exists (created by openshell-bootstrap alongside TLS secrets) +kubectl -n openshell get secret openshell-ssh-handshake >/dev/null 2>&1 || exit 1 + +# --------------------------------------------------------------------------- +# Verify the gateway NodePort (30051) is actually accepting TCP connections. +# After a container restart, kube-proxy may need extra time to re-program +# iptables rules for NodePort routing. 
Without this check the health check +# can pass before the port is routable, causing "Connection refused" on the +# host-mapped port. +# --------------------------------------------------------------------------- +timeout 2 bash -c 'echo >/dev/tcp/127.0.0.1/30051' 2>/dev/null || exit 1 diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 1be8f14ab..ed503a78e 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -77,7 +77,10 @@ spec: value: {{ .Values.server.hostGatewayIP | quote }} {{- end }} - name: OPENSHELL_SSH_HANDSHAKE_SECRET - value: {{ required "server.sshHandshakeSecret is required" .Values.server.sshHandshakeSecret | quote }} + valueFrom: + secretKeyRef: + name: {{ .Values.server.sshHandshakeSecretName | quote }} + key: secret {{- if .Values.server.disableTls }} - name: OPENSHELL_DISABLE_TLS value: "true" diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index ccc8d1ffa..d698e8120 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -85,10 +85,10 @@ server: sshGatewayPort: 0 # TLS configuration for the server. The server always terminates mTLS # directly and requires client certificates. - # HMAC secret used for the NSSH1 handshake between gateway and sandbox SSH. - # Required — the server will refuse to start if empty. For cluster deployments - # this is auto-generated by the entrypoint script. - sshHandshakeSecret: "" + # Name of the Kubernetes Secret holding the NSSH1 HMAC handshake key. + # The secret must contain a `secret` key with the hex-encoded HMAC key. + # For cluster deployments this is auto-created by the bootstrap process. + sshHandshakeSecretName: "openshell-ssh-handshake" # Host gateway IP for sandbox pod hostAliases. 
When set, sandbox pods get # hostAliases entries mapping host.docker.internal and host.openshell.internal # to this IP, allowing them to reach services running on the Docker host. diff --git a/deploy/kube/manifests/openshell-helmchart.yaml b/deploy/kube/manifests/openshell-helmchart.yaml index 2245c72ed..ae22ddc6a 100644 --- a/deploy/kube/manifests/openshell-helmchart.yaml +++ b/deploy/kube/manifests/openshell-helmchart.yaml @@ -32,7 +32,6 @@ spec: sandboxImage: ghcr.io/nvidia/openshell-community/sandboxes/base:latest sshGatewayHost: __SSH_GATEWAY_HOST__ sshGatewayPort: __SSH_GATEWAY_PORT__ - sshHandshakeSecret: __SSH_HANDSHAKE_SECRET__ grpcEndpoint: "https://openshell.openshell.svc.cluster.local:8080" hostGatewayIP: __HOST_GATEWAY_IP__ disableGatewayAuth: __DISABLE_GATEWAY_AUTH__ diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs new file mode 100644 index 000000000..01ad4941e --- /dev/null +++ b/e2e/rust/tests/gateway_resume.rs @@ -0,0 +1,337 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e")] + +//! E2E tests for gateway resume from existing state. +//! +//! All scenarios run inside a **single** `#[tokio::test]` so they execute +//! in a deterministic order and share a known-good gateway state. Each +//! scenario restores the gateway to a healthy state before the next one +//! begins, preventing cascading failures. +//! +//! **Requires a running gateway** — the `e2e:rust` mise task bootstraps one. 
+ +use std::process::{Command, Stdio}; +use std::time::Duration; + +use openshell_e2e::harness::binary::openshell_cmd; +use openshell_e2e::harness::output::strip_ansi; +use tokio::time::sleep; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Resolve the gateway name from the `OPENSHELL_GATEWAY` env var (the same +/// variable the CLI reads), falling back to `"openshell"` which matches CI. +fn gateway_name() -> String { + std::env::var("OPENSHELL_GATEWAY").unwrap_or_else(|_| "openshell".to_string()) +} + +/// Docker container name for the e2e gateway. +fn container_name() -> String { + format!("openshell-cluster-{}", gateway_name()) +} + +/// Run `openshell ` and return (combined output, exit code). +async fn run_cli(args: &[&str]) -> (String, i32) { + let mut cmd = openshell_cmd(); + cmd.args(args) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + let output = cmd.output().await.expect("spawn openshell"); + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let combined = format!("{stdout}{stderr}"); + let code = output.status.code().unwrap_or(-1); + (combined, code) +} + +/// Run `docker ` synchronously and return (stdout, exit code). +fn docker_cmd(args: &[&str]) -> (String, i32) { + let output = Command::new("docker") + .args(args) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .expect("spawn docker"); + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let code = output.status.code().unwrap_or(-1); + (stdout, code) +} + +/// Wait for the gateway to become healthy by polling `openshell status`. 
+async fn wait_for_healthy(timeout: Duration) { + let start = std::time::Instant::now(); + loop { + let (output, code) = run_cli(&["status"]).await; + let clean = strip_ansi(&output).to_lowercase(); + if code == 0 + && (clean.contains("healthy") + || clean.contains("running") + || clean.contains("connected") + || clean.contains("✓")) + { + return; + } + if start.elapsed() > timeout { + panic!( + "gateway did not become healthy within {}s. Last output:\n{}", + timeout.as_secs(), + strip_ansi(&output) + ); + } + sleep(Duration::from_secs(3)).await; + } +} + +/// Read the SSH handshake secret from the K8s secret inside the cluster. +fn read_ssh_handshake_secret() -> Option { + let cname = container_name(); + let (output, code) = docker_cmd(&[ + "exec", + &cname, + "sh", + "-c", + "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null", + ]); + if code == 0 && !output.trim().is_empty() { + Some(output.trim().to_string()) + } else { + None + } +} + +/// Extract the sandbox name from `openshell sandbox create` output. +fn extract_sandbox_name(output: &str) -> String { + strip_ansi(output) + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from create output") +} + +/// Run `gateway start` and log the output if it fails (non-fatal — the +/// test relies on [`wait_for_healthy`] for the real assertion). 
+async fn start_gateway() { + let (output, code) = run_cli(&["gateway", "start"]).await; + if code != 0 { + eprintln!( + "gateway start exited {code} (may still recover):\n{}", + strip_ansi(&output) + ); + } +} + +// --------------------------------------------------------------------------- +// Orchestrated test suite +// --------------------------------------------------------------------------- + +/// Single entry-point that runs every resume scenario in a fixed order. +/// +/// Running as one `#[tokio::test]` gives us: +/// - **Deterministic ordering** — no async-mutex races. +/// - **Cascade prevention** — each scenario starts only after the previous +/// one left the gateway healthy. +/// - **No task-runner hacks** — no `--test-threads`, `--skip`, or split +/// cargo invocations. +#[tokio::test] +async fn gateway_resume_scenarios() { + // The gateway must already be running (bootstrapped by the `cluster` task). + wait_for_healthy(Duration::from_secs(30)).await; + + // Warm the sandbox base image by creating (and deleting) a throwaway + // sandbox. On a fresh cluster the ~1 GB image pull can take minutes; + // doing it once up-front keeps the actual scenarios snappy. 
+ eprintln!("--- warmup: pulling sandbox base image ---"); + let (output, code) = + run_cli(&["sandbox", "create", "--", "echo", "warmup"]).await; + if code == 0 { + let name = extract_sandbox_name(&output); + let _ = run_cli(&["sandbox", "delete", &name]).await; + } else { + eprintln!( + "warmup sandbox create failed (non-fatal, image may already be cached):\n{}", + strip_ansi(&output) + ); + } + + scenario_start_on_running_gateway().await; + scenario_ssh_secret_persists_across_restart().await; + scenario_stop_start_resumes_with_sandbox().await; + scenario_container_kill_resumes().await; + scenario_container_removal_resumes().await; +} + +// --------------------------------------------------------------------------- +// Scenario: `gateway start` on an already-running gateway +// --------------------------------------------------------------------------- + +async fn scenario_start_on_running_gateway() { + eprintln!("--- scenario: start on running gateway ---"); + + let (output, code) = run_cli(&["gateway", "start"]).await; + let clean = strip_ansi(&output); + + assert_eq!( + code, 0, + "gateway start on running gateway should exit 0:\n{clean}" + ); + assert!( + clean.to_lowercase().contains("already running"), + "output should indicate gateway is already running:\n{clean}" + ); +} + +// --------------------------------------------------------------------------- +// Scenario: SSH handshake secret persists across restart +// --------------------------------------------------------------------------- + +async fn scenario_ssh_secret_persists_across_restart() { + eprintln!("--- scenario: SSH secret persists across restart ---"); + + let secret_before = + read_ssh_handshake_secret().expect("SSH handshake secret should exist before restart"); + assert!( + !secret_before.is_empty(), + "SSH handshake secret should not be empty" + ); + + // Stop → start. 
+ let (_, stop_code) = run_cli(&["gateway", "stop"]).await; + assert_eq!(stop_code, 0, "gateway stop should succeed"); + sleep(Duration::from_secs(3)).await; + + start_gateway().await; + wait_for_healthy(Duration::from_secs(300)).await; + + let secret_after = + read_ssh_handshake_secret().expect("SSH handshake secret should exist after restart"); + assert_eq!( + secret_before, secret_after, + "SSH handshake secret should be identical before and after restart" + ); +} + +// --------------------------------------------------------------------------- +// Scenario: stop → start resumes, sandbox survives +// --------------------------------------------------------------------------- + +async fn scenario_stop_start_resumes_with_sandbox() { + eprintln!("--- scenario: stop/start resumes with sandbox ---"); + + // Create a sandbox. + let (output, code) = + run_cli(&["sandbox", "create", "--", "echo", "resume-test"]).await; + assert_eq!( + code, 0, + "sandbox create should succeed:\n{}", + strip_ansi(&output) + ); + let sandbox_name = extract_sandbox_name(&output); + + // Stop → start. + let (stop_output, stop_code) = run_cli(&["gateway", "stop"]).await; + assert_eq!( + stop_code, 0, + "gateway stop should succeed:\n{}", + strip_ansi(&stop_output) + ); + sleep(Duration::from_secs(3)).await; + + // Verify container is stopped. + let (inspect_out, _) = docker_cmd(&[ + "inspect", + "-f", + "{{.State.Running}}", + &container_name(), + ]); + assert_eq!( + inspect_out.trim(), + "false", + "container should be stopped after gateway stop" + ); + + start_gateway().await; + wait_for_healthy(Duration::from_secs(300)).await; + + // Verify sandbox survived. 
+ let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean_list = strip_ansi(&list_output); + assert_eq!( + list_code, 0, + "sandbox list should succeed:\n{clean_list}" + ); + assert!( + clean_list.contains(&sandbox_name), + "sandbox '{sandbox_name}' should survive stop/start.\nList:\n{clean_list}" + ); + + let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; +} + +// --------------------------------------------------------------------------- +// Scenario: container killed → resume with stale network +// --------------------------------------------------------------------------- + +async fn scenario_container_kill_resumes() { + eprintln!("--- scenario: container kill resumes ---"); + + let cname = container_name(); + let net_name = format!("openshell-cluster-{}", gateway_name()); + + // Kill the container. + let (_, kill_code) = docker_cmd(&["kill", &cname]); + assert_eq!(kill_code, 0, "docker kill should succeed"); + sleep(Duration::from_secs(3)).await; + + // Remove the network to simulate a stale network reference. + // The bootstrap `ensure_network` always destroys and recreates, so + // after this the container's stored network ID will be invalid. + let _ = docker_cmd(&["network", "disconnect", "-f", &net_name, &cname]); + let (_, net_rm_code) = docker_cmd(&["network", "rm", &net_name]); + assert_eq!( + net_rm_code, 0, + "docker network rm should succeed" + ); + + // Resume — must handle stale network + reuse existing PKI. + start_gateway().await; + wait_for_healthy(Duration::from_secs(300)).await; +} + +// --------------------------------------------------------------------------- +// Scenario: container removed → resume from volume +// --------------------------------------------------------------------------- + +async fn scenario_container_removal_resumes() { + eprintln!("--- scenario: container removal resumes ---"); + + // Force-remove the container. 
+ let (_, rm_code) = docker_cmd(&["rm", "-f", &container_name()]); + assert_eq!(rm_code, 0, "docker rm -f should succeed"); + + // Volume should survive. + let (vol_out, vol_code) = docker_cmd(&[ + "volume", + "inspect", + &format!("openshell-cluster-{}", gateway_name()), + ]); + assert_eq!( + vol_code, 0, + "volume should still exist after container removal:\n{vol_out}" + ); + + // Resume from volume. + start_gateway().await; + wait_for_healthy(Duration::from_secs(300)).await; +} diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 000000000..25f96ab68 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +[toolchain] +channel = "stable" diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index 600bdd6c7..307e76233 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -408,12 +408,13 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then # terminates mTLS (there is no server.tls.enabled toggle). Without this, # a prior Helm override or chart default change could silently regress # sandbox callbacks to plaintext. - # Retrieve the existing handshake secret from the running release, or generate - # a new one if this is the first deploy with the mandatory secret. - EXISTING_SECRET=$(cluster_exec "helm get values openshell -n openshell -o json 2>/dev/null \ - | grep -o '\"sshHandshakeSecret\":\"[^\"]*\"' \ - | cut -d'\"' -f4") || true - SSH_HANDSHAKE_SECRET="${EXISTING_SECRET:-$(openssl rand -hex 32)}" + # Ensure the SSH handshake K8s secret exists. The bootstrap process normally + # creates it, but fast-deploy may run before bootstrap on a fresh cluster. 
+ EXISTING_SECRET=$(cluster_exec "kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null | base64 -d" 2>/dev/null) || true + if [ -z "${EXISTING_SECRET}" ]; then + SSH_HANDSHAKE_SECRET="$(openssl rand -hex 32)" + cluster_exec "kubectl -n openshell create secret generic openshell-ssh-handshake --from-literal=secret='${SSH_HANDSHAKE_SECRET}' --dry-run=client -o yaml | kubectl apply -f -" + fi # Retrieve the host gateway IP from the entrypoint-rendered HelmChart CR so # that hostAliases for host.openshell.internal are preserved across fast deploys. @@ -433,7 +434,6 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then --set server.tls.certSecretName=openshell-server-tls \ --set server.tls.clientCaSecretName=openshell-server-client-ca \ --set server.tls.clientTlsSecretName=openshell-client-tls \ - --set server.sshHandshakeSecret=${SSH_HANDSHAKE_SECRET} \ ${HOST_GATEWAY_ARGS} \ ${helm_wait_args}" helm_end=$(date +%s) diff --git a/tasks/test.toml b/tasks/test.toml index 6231c21e7..c383eafb5 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -32,7 +32,8 @@ description = "Run Rust CLI e2e tests (requires a running cluster)" depends = ["cluster"] run = [ "cargo build -p openshell-cli --features openshell-core/dev-settings", - "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e", + # gateway_resume tests run in a dedicated CI job with their own cluster. + "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e -- --skip gateway_resume_scenarios", ] ["e2e:python"] From dd8dd8a60cceab3ef9f71c7f04cca13289d35462 Mon Sep 17 00:00:00 2001 From: "John T. 
Myers" <9696606+johntmyers@users.noreply.github.com> Date: Thu, 2 Apr 2026 09:50:36 -0700 Subject: [PATCH 37/45] fix(security): bump container dependencies to remediate 10 CVEs (#736) - k3s v1.35.2-k3s1 -> v1.35.3-k3s1 (containerd v2.2.2, runc v1.4.1, Go 1.25.7) - Docker CLI 29.3.0 -> 29.3.1 (Go 1.25.8, containerd v2.2.2) - syft 1.42.2 -> 1.42.3 (bumps buger/jsonparser) - Explicit gpgv and python3 upgrades in all container images Addresses: GHSA-p77j-4mvh-x3m3 (Critical), GHSA-pwhc-rpq9-4c8w, GHSA-p436-gjf2-799p, GHSA-9h8m-3fm2-qjrq, GHSA-6v2p-p543-phr9, GHSA-6g7g-w4f8-9c9x, GHSA-4qg8-fj49-pxjh, CVE-2026-4519, CVE-2025-68973, CVE-2024-36623 Closes #735 --- deploy/docker/Dockerfile.ci | 3 ++- deploy/docker/Dockerfile.images | 9 ++++++--- mise.toml | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/deploy/docker/Dockerfile.ci b/deploy/docker/Dockerfile.ci index 4ab01cebe..b87962b7e 100644 --- a/deploy/docker/Dockerfile.ci +++ b/deploy/docker/Dockerfile.ci @@ -8,7 +8,7 @@ FROM nvcr.io/nvidia/base/ubuntu:noble-20251013 -ARG DOCKER_VERSION=29.3.0 +ARG DOCKER_VERSION=29.3.1 ARG BUILDX_VERSION=v0.32.1 ARG TARGETARCH @@ -34,6 +34,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ xz-utils \ jq \ rsync \ + && apt-get install -y --only-upgrade gpgv python3 \ && rm -rf /var/lib/apt/lists/* # Install Docker CLI and buildx plugin used by CI jobs diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index af17b9b0a..d078429df 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -15,8 +15,8 @@ # Pin by tag AND manifest-list digest to prevent silent upstream republishes # from breaking the build. Update both when bumping k3s versions. 
# To refresh: docker buildx imagetools inspect rancher/k3s: | head -3 -ARG K3S_VERSION=v1.35.2-k3s1 -ARG K3S_DIGEST=sha256:c3184157c3048112bab0c3e17405991da486cb3413511eba23f7650efd70776b +ARG K3S_VERSION=v1.35.3-k3s1 +ARG K3S_DIGEST=sha256:4607083d3cac07e1ccde7317297271d13ed5f60f35a78f33fcef84858a9f1d69 ARG K9S_VERSION=v0.50.18 ARG HELM_VERSION=v3.17.3 ARG NVIDIA_CONTAINER_TOOLKIT_VERSION=1.18.2-1 @@ -165,7 +165,9 @@ COPY --from=supervisor-builder /build/out/openshell-sandbox /openshell-sandbox FROM nvcr.io/nvidia/base/ubuntu:noble-20251013 AS gateway RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates && rm -rf /var/lib/apt/lists/* + ca-certificates && \ + apt-get install -y --only-upgrade gpgv && \ + rm -rf /var/lib/apt/lists/* RUN useradd --create-home --user-group openshell @@ -230,6 +232,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ iptables \ mount \ dnsutils \ + && apt-get install -y --only-upgrade gpgv \ && rm -rf /var/lib/apt/lists/* COPY --from=k3s /bin/ /bin/ diff --git a/mise.toml b/mise.toml index d204f5316..4bcb4e072 100644 --- a/mise.toml +++ b/mise.toml @@ -20,7 +20,7 @@ uv = "0.10.2" protoc = "29.6" helm = "4.1.1" "ubi:mozilla/sccache" = { version = "0.14.0", matching = "sccache-v" } -"ubi:anchore/syft" = { version = "1.42.2", matching = "syft_" } +"ubi:anchore/syft" = { version = "1.42.3", matching = "syft_" } "ubi:EmbarkStudios/cargo-about" = "0.8.4" [env] From b56f8308c5ae7cb2de7d67c7b7825f8ef41744d8 Mon Sep 17 00:00:00 2001 From: "John T. 
Myers" <9696606+johntmyers@users.noreply.github.com> Date: Thu, 2 Apr 2026 12:54:59 -0700 Subject: [PATCH 38/45] fix(security): update OSS dependencies to remediate 3 high-severity CVEs (#737) - tar 0.4.44 -> 0.4.45 (CVE-2026-33055: PAX size header skip) - aws-lc-rs 1.16.1 -> 1.16.2 / aws-lc-sys 0.38.0 -> 0.39.1 (BDSA-2026-5232: name constraints bypass in certificate validation) - Pygments 2.19.2 -> 2.20.0 (BDSA-2026-5113 / CVE-2026-4539: catastrophic regex backtracking) --- Cargo.lock | 26 +++++++++++++------------- uv.lock | 6 +++--- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8f827bc88..7d20c9bd8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -125,7 +125,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -136,7 +136,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.16.1" +version = "1.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94bffc006df10ac2a68c83692d734a465f8ee6c5b384d8545a636f81d858f4bf" +checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" dependencies = [ "aws-lc-sys", "untrusted 0.7.1", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.38.0" +version = "0.39.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4321e568ed89bb5a7d291a7f37997c2c0df89809d7b6d12062c81ddb54aa782e" +checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399" dependencies = [ "cc", "cmake", @@ -1283,7 +1283,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -2691,7 +2691,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -4044,7 +4044,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.12.1", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -4519,7 +4519,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -4902,9 +4902,9 @@ dependencies = [ [[package]] name = "tar" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" dependencies = [ "filetime", "libc", @@ -4930,7 +4930,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix 1.1.4", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] diff --git a/uv.lock b/uv.lock index 687a035ae..38a03ce29 100644 --- a/uv.lock +++ b/uv.lock @@ -637,11 +637,11 @@ wheels = [ [[package]] name = "pygments" -version = "2.19.2" +version = "2.20.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, + { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, ] [[package]] From 8887d7c66ac6589617e70776e2c7e2104a9a81e6 Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:30:51 -0700 Subject: [PATCH 39/45] fix(sandbox): harden seccomp filter to block dangerous syscalls (#740) --- architecture/sandbox.md | 46 ++++- architecture/security-policy.md | 29 ++- .../src/sandbox/linux/seccomp.rs | 182 ++++++++++++++++++ 3 files changed, 248 insertions(+), 9 deletions(-) diff --git a/architecture/sandbox.md b/architecture/sandbox.md index c870708dd..c5e212f85 100644 --- a/architecture/sandbox.md +++ b/architecture/sandbox.md @@ -24,7 +24,7 @@ All paths are relative to `crates/openshell-sandbox/src/`. 
| `sandbox/mod.rs` | Platform abstraction -- dispatches to Linux or no-op | | `sandbox/linux/mod.rs` | Linux composition: Landlock then seccomp | | `sandbox/linux/landlock.rs` | Filesystem isolation via Landlock LSM (ABI V1) | -| `sandbox/linux/seccomp.rs` | Syscall filtering via BPF on `SYS_socket` | +| `sandbox/linux/seccomp.rs` | Syscall filtering via BPF: socket domain blocks, dangerous syscall blocks, conditional flag blocks | | `bypass_monitor.rs` | Background `/dev/kmsg` reader for iptables bypass detection events | | `sandbox/linux/netns.rs` | Network namespace creation, veth pair setup, bypass detection iptables rules, cleanup on drop | | `l7/mod.rs` | L7 types (`L7Protocol`, `TlsMode`, `EnforcementMode`, `L7EndpointConfig`), config parsing, validation, access preset expansion, deprecated `tls` value handling | @@ -451,13 +451,7 @@ Kernel-level error behavior (e.g., Landlock ABI unavailable) depends on `Landloc **File:** `crates/openshell-sandbox/src/sandbox/linux/seccomp.rs` -Seccomp blocks socket creation for specific address families. The filter targets a single syscall (`SYS_socket`) and inspects argument 0 (the domain). - -**Always blocked** (regardless of network mode): -- `AF_NETLINK`, `AF_PACKET`, `AF_BLUETOOTH`, `AF_VSOCK` - -**Additionally blocked in `Block` mode** (no proxy): -- `AF_INET`, `AF_INET6` +Seccomp provides three layers of syscall restriction: socket domain blocks, unconditional syscall blocks, and conditional syscall blocks. The filter uses a default-allow policy (`SeccompAction::Allow`) with targeted rules that return `Errno(EPERM)`. **Skipped entirely** in `Allow` mode. @@ -465,8 +459,44 @@ Setup: 1. `prctl(PR_SET_NO_NEW_PRIVS, 1)` -- required before seccomp 2. 
`seccompiler::apply_filter()` with default action `Allow` and per-rule action `Errno(EPERM)` +#### Socket domain blocks + +| Domain | Always blocked | Additionally blocked in Block mode | +|--------|:-:|:-:| +| `AF_PACKET` | Yes | | +| `AF_BLUETOOTH` | Yes | | +| `AF_VSOCK` | Yes | | +| `AF_INET` | | Yes | +| `AF_INET6` | | Yes | +| `AF_NETLINK` | | Yes | + In `Proxy` mode, `AF_INET`/`AF_INET6` are allowed because the sandboxed process needs to connect to the proxy over the veth pair. The network namespace ensures it can only reach the proxy's IP (`10.200.0.1`). +#### Unconditional syscall blocks + +These syscalls are blocked entirely (EPERM for any invocation): + +| Syscall | Reason | +|---------|--------| +| `memfd_create` | Fileless binary execution bypasses Landlock filesystem restrictions | +| `ptrace` | Cross-process memory inspection and code injection | +| `bpf` | Kernel BPF program loading | +| `process_vm_readv` | Cross-process memory read | +| `io_uring_setup` | Async I/O subsystem with extensive CVE history | +| `mount` | Filesystem mount could subvert Landlock or overlay writable paths | + +#### Conditional syscall blocks + +These syscalls are only blocked when specific flag patterns are present: + +| Syscall | Condition | Reason | +|---------|-----------|--------| +| `execveat` | `AT_EMPTY_PATH` flag set (arg4) | Fileless execution from an anonymous fd | +| `unshare` | `CLONE_NEWUSER` flag set (arg0) | User namespace creation enables privilege escalation | +| `seccomp` | operation == `SECCOMP_SET_MODE_FILTER` (arg0) | Prevents sandboxed code from replacing the active filter | + +Conditional blocks use `MaskedEq` for flag checks (bit-test) and `Eq` for exact-value matches. This allows normal use of these syscalls while blocking the dangerous flag combinations. 
+ ### Network namespace isolation **File:** `crates/openshell-sandbox/src/sandbox/linux/netns.rs` diff --git a/architecture/security-policy.md b/architecture/security-policy.md index 555ba67a5..01eb96f94 100644 --- a/architecture/security-policy.md +++ b/architecture/security-policy.md @@ -850,6 +850,10 @@ The response includes an `X-OpenShell-Policy` header and `Connection: close`. Se ## Seccomp Filter Details +The seccomp filter uses a default-allow policy (`SeccompAction::Allow`) with targeted rules that return `EPERM`. It provides three layers of protection: socket domain blocks, unconditional syscall blocks, and conditional syscall blocks. See `crates/openshell-sandbox/src/sandbox/linux/seccomp.rs`. + +### Blocked socket domains + Regardless of network mode, certain socket domains are always blocked: | Domain | Constant | Reason | @@ -861,7 +865,30 @@ Regardless of network mode, certain socket domains are always blocked: In proxy mode (which is always active), `AF_INET` (2) and `AF_INET6` (10) are allowed so the sandbox process can reach the proxy. -The seccomp filter uses a default-allow policy (`SeccompAction::Allow`) with specific `socket()` syscall rules that return `EPERM` when the first argument (domain) matches a blocked value. See `crates/openshell-sandbox/src/sandbox/linux/seccomp.rs`. 
+### Blocked syscalls + +These syscalls are blocked unconditionally (EPERM for any invocation): + +| Syscall | NR (x86-64) | Reason | +|---------|-------------|--------| +| `memfd_create` | 319 | Fileless binary execution bypasses Landlock filesystem restrictions | +| `ptrace` | 101 | Cross-process memory inspection and code injection | +| `bpf` | 321 | Kernel BPF program loading | +| `process_vm_readv` | 310 | Cross-process memory read | +| `io_uring_setup` | 425 | Async I/O subsystem with extensive CVE history | +| `mount` | 165 | Filesystem mount could subvert Landlock or overlay writable paths | + +### Conditionally blocked syscalls + +These syscalls are blocked only when specific flag patterns are present in their arguments: + +| Syscall | NR (x86-64) | Condition | Reason | +|---------|-------------|-----------|--------| +| `execveat` | 322 | `AT_EMPTY_PATH` (0x1000) set in flags (arg4) | Fileless execution from an anonymous fd | +| `unshare` | 272 | `CLONE_NEWUSER` (0x10000000) set in flags (arg0) | User namespace creation enables privilege escalation | +| `seccomp` | 317 | operation == `SECCOMP_SET_MODE_FILTER` (1) in arg0 | Prevents sandboxed code from replacing the active filter | + +Flag checks use `MaskedEq` (`(arg & mask) == mask`) to detect the flag bit regardless of other bits. The `seccomp` syscall check uses `Eq` for exact value comparison on the operation argument. --- diff --git a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs index 6c9d8307b..e23447498 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs @@ -2,6 +2,15 @@ // SPDX-License-Identifier: Apache-2.0 //! Seccomp syscall filtering. +//! +//! The filter uses a default-allow policy with targeted blocks: +//! +//! 1. **Socket domain blocks** -- prevent raw/kernel sockets that bypass the proxy +//! 2. 
**Unconditional syscall blocks** -- block syscalls that enable sandbox escape +//! (fileless exec, ptrace, BPF, cross-process memory access, io_uring, mount) +//! 3. **Conditional syscall blocks** -- block dangerous flag combinations on otherwise +//! needed syscalls (execveat+AT_EMPTY_PATH, unshare+CLONE_NEWUSER, +//! seccomp+SET_MODE_FILTER) use crate::policy::{NetworkMode, SandboxPolicy}; use miette::{IntoDiagnostic, Result}; @@ -13,6 +22,9 @@ use std::collections::BTreeMap; use std::convert::TryInto; use tracing::debug; +/// Value of `SECCOMP_SET_MODE_FILTER` (linux/seccomp.h). +const SECCOMP_SET_MODE_FILTER: u64 = 1; + pub fn apply(policy: &SandboxPolicy) -> Result<()> { if matches!(policy.network.mode, NetworkMode::Allow) { return Ok(()); @@ -37,6 +49,7 @@ pub fn apply(policy: &SandboxPolicy) -> Result<()> { fn build_filter(allow_inet: bool) -> Result { let mut rules: BTreeMap> = BTreeMap::new(); + // --- Socket domain blocks --- let mut blocked_domains = vec![libc::AF_PACKET, libc::AF_BLUETOOTH, libc::AF_VSOCK]; if !allow_inet { blocked_domains.push(libc::AF_INET); @@ -49,6 +62,51 @@ fn build_filter(allow_inet: bool) -> Result { add_socket_domain_rule(&mut rules, domain)?; } + // --- Unconditional syscall blocks --- + // These syscalls are blocked entirely (empty rule vec = unconditional EPERM). + + // Fileless binary execution via memfd bypasses Landlock filesystem restrictions. + rules.entry(libc::SYS_memfd_create).or_default(); + // Cross-process memory inspection and code injection. + rules.entry(libc::SYS_ptrace).or_default(); + // Kernel BPF program loading. + rules.entry(libc::SYS_bpf).or_default(); + // Cross-process memory read. + rules.entry(libc::SYS_process_vm_readv).or_default(); + // Async I/O subsystem with extensive CVE history. + rules.entry(libc::SYS_io_uring_setup).or_default(); + // Filesystem mount could subvert Landlock or overlay writable paths. 
+ rules.entry(libc::SYS_mount).or_default(); + + // --- Conditional syscall blocks --- + + // execveat with AT_EMPTY_PATH enables fileless execution from an anonymous fd. + add_masked_arg_rule( + &mut rules, + libc::SYS_execveat, + 4, // flags argument + libc::AT_EMPTY_PATH as u64, + )?; + + // unshare with CLONE_NEWUSER allows creating user namespaces to escalate privileges. + add_masked_arg_rule( + &mut rules, + libc::SYS_unshare, + 0, // flags argument + libc::CLONE_NEWUSER as u64, + )?; + + // seccomp(SECCOMP_SET_MODE_FILTER) would let sandboxed code replace the active filter. + let condition = SeccompCondition::new( + 0, // operation argument + SeccompCmpArgLen::Dword, + SeccompCmpOp::Eq, + SECCOMP_SET_MODE_FILTER, + ) + .into_diagnostic()?; + let rule = SeccompRule::new(vec![condition]).into_diagnostic()?; + rules.entry(libc::SYS_seccomp).or_default().push(rule); + let arch = std::env::consts::ARCH .try_into() .map_err(|_| miette::miette!("Unsupported architecture for seccomp"))?; @@ -74,3 +132,127 @@ fn add_socket_domain_rule(rules: &mut BTreeMap>, domain: i rules.entry(libc::SYS_socket).or_default().push(rule); Ok(()) } + +/// Block a syscall when a specific bit pattern is set in an argument. +/// +/// Uses `MaskedEq` to check `(arg & flag_bit) == flag_bit`, which triggers +/// EPERM when the flag is present regardless of other bits in the argument. 
+fn add_masked_arg_rule( + rules: &mut BTreeMap>, + syscall: i64, + arg_index: u8, + flag_bit: u64, +) -> Result<()> { + let condition = SeccompCondition::new( + arg_index, + SeccompCmpArgLen::Dword, + SeccompCmpOp::MaskedEq(flag_bit), + flag_bit, + ) + .into_diagnostic()?; + let rule = SeccompRule::new(vec![condition]).into_diagnostic()?; + rules.entry(syscall).or_default().push(rule); + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn build_filter_proxy_mode_compiles() { + let filter = build_filter(true); + assert!(filter.is_ok(), "build_filter(true) should succeed"); + } + + #[test] + fn build_filter_block_mode_compiles() { + let filter = build_filter(false); + assert!(filter.is_ok(), "build_filter(false) should succeed"); + } + + #[test] + fn add_masked_arg_rule_creates_entry() { + let mut rules: BTreeMap> = BTreeMap::new(); + let result = add_masked_arg_rule(&mut rules, libc::SYS_execveat, 4, 0x1000); + assert!(result.is_ok()); + assert!( + rules.contains_key(&libc::SYS_execveat), + "should have an entry for SYS_execveat" + ); + assert_eq!( + rules[&libc::SYS_execveat].len(), + 1, + "should have exactly one rule" + ); + } + + #[test] + fn unconditional_blocks_present_in_filter() { + let mut rules: BTreeMap> = BTreeMap::new(); + + // Simulate what build_filter does for unconditional blocks + rules.entry(libc::SYS_memfd_create).or_default(); + rules.entry(libc::SYS_ptrace).or_default(); + rules.entry(libc::SYS_bpf).or_default(); + rules.entry(libc::SYS_process_vm_readv).or_default(); + rules.entry(libc::SYS_io_uring_setup).or_default(); + rules.entry(libc::SYS_mount).or_default(); + + // Unconditional blocks have an empty Vec (no conditions = always match) + for syscall in [ + libc::SYS_memfd_create, + libc::SYS_ptrace, + libc::SYS_bpf, + libc::SYS_process_vm_readv, + libc::SYS_io_uring_setup, + libc::SYS_mount, + ] { + assert!( + rules.contains_key(&syscall), + "syscall {syscall} should be in the rules map" + ); + assert!( + 
rules[&syscall].is_empty(), + "syscall {syscall} should have empty rules (unconditional block)" + ); + } + } + + #[test] + fn conditional_blocks_have_rules() { + // Build a real filter and verify the conditional syscalls have rule entries + // (non-empty Vec means conditional match) + let mut rules: BTreeMap> = BTreeMap::new(); + + add_masked_arg_rule( + &mut rules, + libc::SYS_execveat, + 4, + libc::AT_EMPTY_PATH as u64, + ) + .unwrap(); + add_masked_arg_rule(&mut rules, libc::SYS_unshare, 0, libc::CLONE_NEWUSER as u64).unwrap(); + + let condition = SeccompCondition::new( + 0, + SeccompCmpArgLen::Dword, + SeccompCmpOp::Eq, + SECCOMP_SET_MODE_FILTER, + ) + .unwrap(); + let rule = SeccompRule::new(vec![condition]).unwrap(); + rules.entry(libc::SYS_seccomp).or_default().push(rule); + + for syscall in [libc::SYS_execveat, libc::SYS_unshare, libc::SYS_seccomp] { + assert!( + rules.contains_key(&syscall), + "syscall {syscall} should be in the rules map" + ); + assert!( + !rules[&syscall].is_empty(), + "syscall {syscall} should have conditional rules" + ); + } + } +} From 77e55ea989d144b8761875a6c566d9289dac460b Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Thu, 2 Apr 2026 15:06:32 -0700 Subject: [PATCH 40/45] test(e2e): replace flaky Python live policy update tests with Rust (#742) Remove test_live_policy_update_and_logs and test_live_policy_update_from_empty_network_policies from the Python e2e suite. Both used a manual 90s poll loop against GetSandboxPolicyStatus that flaked in CI with 'Policy v2 was not loaded within 90s'. Add e2e/rust/tests/live_policy_update.rs with two replacement tests that exercise the same policy lifecycle (version bumping, hash idempotency, policy list history) through the CLI using the built-in --wait flag for reliable synchronization. 
--- e2e/python/test_sandbox_policy.py | 260 +--------------- e2e/rust/tests/live_policy_update.rs | 423 +++++++++++++++++++++++++++ 2 files changed, 424 insertions(+), 259 deletions(-) create mode 100644 e2e/rust/tests/live_policy_update.rs diff --git a/e2e/python/test_sandbox_policy.py b/e2e/python/test_sandbox_policy.py index 625fe8da0..092f99784 100644 --- a/e2e/python/test_sandbox_policy.py +++ b/e2e/python/test_sandbox_policy.py @@ -314,9 +314,7 @@ def log_message(self, *args): {"connect_status": connect_resp.strip(), "http_status": 0} ) - request = ( - f"{method} {path} HTTP/1.1\r\nHost: {target_host}\r\nConnection: close\r\n\r\n" - ) + request = f"{method} {path} HTTP/1.1\r\nHost: {target_host}\r\nConnection: close\r\n\r\n" conn.sendall(request.encode()) data = b"" @@ -1348,262 +1346,6 @@ def test_l7_rule_without_query_matcher_allows_any_query_params( assert "connect-server-ok" in resp["body"] -# ============================================================================= -# Live policy update + log streaming tests -# -# LPU-1: Create sandbox, verify initial policy is v1 -# LPU-2: Set the same policy again -> unchanged (no new version) -# LPU-3: Push a different policy -> new version loaded, verify connectivity -# LPU-4: Push v2 again -> unchanged -# LPU-5: Fetch logs (one-shot + streaming) and verify both sources appear -# ============================================================================= - - -def test_live_policy_update_and_logs( - sandbox: Callable[..., Sandbox], - sandbox_client: SandboxClient, -) -> None: - """End-to-end: live policy update lifecycle with log verification.""" - from openshell._proto import openshell_pb2, sandbox_pb2 - - # --- Setup: two distinct policies --- - # Policy A: python can reach api.anthropic.com - policy_a = _base_policy( - network_policies={ - "anthropic": sandbox_pb2.NetworkPolicyRule( - name="anthropic", - endpoints=[ - sandbox_pb2.NetworkEndpoint(host="api.anthropic.com", port=443), - ], - 
binaries=[sandbox_pb2.NetworkBinary(path="/**")], - ), - }, - ) - # Policy B: python can reach api.anthropic.com AND example.com - policy_b = _base_policy( - network_policies={ - "anthropic": sandbox_pb2.NetworkPolicyRule( - name="anthropic", - endpoints=[ - sandbox_pb2.NetworkEndpoint(host="api.anthropic.com", port=443), - ], - binaries=[sandbox_pb2.NetworkBinary(path="/**")], - ), - "example": sandbox_pb2.NetworkPolicyRule( - name="example", - endpoints=[ - sandbox_pb2.NetworkEndpoint(host="example.com", port=443), - ], - binaries=[sandbox_pb2.NetworkBinary(path="/**")], - ), - }, - ) - - spec = datamodel_pb2.SandboxSpec(policy=policy_a) - stub = sandbox_client._stub - - with sandbox(spec=spec, delete_on_exit=True) as sb: - sandbox_name = sb.sandbox.name - - # --- LPU-1: Initial policy should be version 1 --- - status_resp = stub.GetSandboxPolicyStatus( - openshell_pb2.GetSandboxPolicyStatusRequest(name=sandbox_name, version=0) - ) - assert status_resp.revision.version >= 1, "Initial policy should be at least v1" - initial_version = status_resp.revision.version - initial_hash = status_resp.revision.policy_hash - - # --- LPU-2: Set the same policy -> no new version --- - update_resp = stub.UpdateConfig( - openshell_pb2.UpdateConfigRequest( - name=sandbox_name, - policy=policy_a, - ) - ) - assert update_resp.version == initial_version, ( - f"Same policy should return existing version {initial_version}, " - f"got {update_resp.version}" - ) - assert update_resp.policy_hash == initial_hash - - # --- LPU-3: Push policy B -> new version --- - update_resp = stub.UpdateConfig( - openshell_pb2.UpdateConfigRequest( - name=sandbox_name, - policy=policy_b, - ) - ) - new_version = update_resp.version - assert new_version > initial_version, ( - f"Different policy should create new version > {initial_version}, " - f"got {new_version}" - ) - assert update_resp.policy_hash != initial_hash - - # Wait for the sandbox to load the new policy (poll loop is 30s default). 
- import time - - deadline = time.time() + 90 - loaded = False - while time.time() < deadline: - status_resp = stub.GetSandboxPolicyStatus( - openshell_pb2.GetSandboxPolicyStatusRequest( - name=sandbox_name, version=new_version - ) - ) - status = status_resp.revision.status - if status == openshell_pb2.POLICY_STATUS_LOADED: - loaded = True - break - if status == openshell_pb2.POLICY_STATUS_FAILED: - pytest.fail( - f"Policy v{new_version} failed to load: " - f"{status_resp.revision.load_error}" - ) - time.sleep(2) - assert loaded, f"Policy v{new_version} was not loaded within 90s" - - # Verify the new policy works: example.com should now be allowed - result = sb.exec_python(_proxy_connect(), args=("example.com", 443)) - assert result.exit_code == 0, result.stderr - assert "200" in result.stdout, ( - f"example.com should be allowed after policy update, got: {result.stdout}" - ) - - # --- LPU-4: Push policy B again -> unchanged --- - update_resp = stub.UpdateConfig( - openshell_pb2.UpdateConfigRequest( - name=sandbox_name, - policy=policy_b, - ) - ) - assert update_resp.version == new_version, ( - f"Same policy B should return existing version {new_version}, " - f"got {update_resp.version}" - ) - - # --- LPU-5: Verify policy history --- - list_resp = stub.ListSandboxPolicies( - openshell_pb2.ListSandboxPoliciesRequest(name=sandbox_name, limit=10) - ) - versions = [r.version for r in list_resp.revisions] - assert new_version in versions - assert initial_version in versions - - # Only one version should be Loaded - loaded_count = sum( - 1 - for r in list_resp.revisions - if r.status == openshell_pb2.POLICY_STATUS_LOADED - ) - assert loaded_count == 1, ( - f"Expected exactly 1 loaded version, got {loaded_count}: " - f"{[(r.version, r.status) for r in list_resp.revisions]}" - ) - - # --- LPU-6: Fetch logs (one-shot) and verify both sources --- - # Resolve sandbox ID for log RPCs - get_resp = stub.GetSandbox(openshell_pb2.GetSandboxRequest(name=sandbox_name)) - sandbox_id 
= get_resp.sandbox.id - - logs_resp = stub.GetSandboxLogs( - openshell_pb2.GetSandboxLogsRequest(sandbox_id=sandbox_id, lines=500) - ) - assert logs_resp.buffer_total > 0, "Expected some logs in the buffer" - - sources = {log.source or "gateway" for log in logs_resp.logs} - assert "gateway" in sources, ( - f"Expected gateway logs in response, got sources: {sources}" - ) - # Sandbox logs may take a moment to arrive via the push stream. - # If they're present, verify the source tag. - if "sandbox" in sources: - sandbox_logs = [l for l in logs_resp.logs if l.source == "sandbox"] - assert len(sandbox_logs) > 0 - # Verify structured fields are present on at least one sandbox log - has_fields = any(len(l.fields) > 0 for l in sandbox_logs) - # Not all sandbox logs have fields (e.g., "Starting sandbox" doesn't), - # so we just check at least one does if there are CONNECT logs - connect_logs = [l for l in sandbox_logs if "CONNECT" in l.message] - if connect_logs: - assert has_fields, "CONNECT logs should have structured fields" - - -def test_live_policy_update_from_empty_network_policies( - sandbox: Callable[..., Sandbox], - sandbox_client: SandboxClient, -) -> None: - """End-to-end: add the first network rule to a running sandbox.""" - from openshell._proto import openshell_pb2, sandbox_pb2 - - initial_policy = _base_policy() - updated_policy = _base_policy( - network_policies={ - "example": sandbox_pb2.NetworkPolicyRule( - name="example", - endpoints=[ - sandbox_pb2.NetworkEndpoint(host="example.com", port=443), - ], - binaries=[sandbox_pb2.NetworkBinary(path="/**")], - ), - }, - ) - - spec = datamodel_pb2.SandboxSpec(policy=initial_policy) - stub = sandbox_client._stub - - with sandbox(spec=spec, delete_on_exit=True) as sb: - sandbox_name = sb.sandbox.name - - denied = sb.exec_python(_proxy_connect(), args=("example.com", 443)) - assert denied.exit_code == 0, denied.stderr - assert "403" in denied.stdout, denied.stdout - - initial_status = stub.GetSandboxPolicyStatus( - 
openshell_pb2.GetSandboxPolicyStatusRequest(name=sandbox_name, version=0) - ) - initial_version = initial_status.revision.version - - update_resp = stub.UpdateConfig( - openshell_pb2.UpdateConfigRequest( - name=sandbox_name, - policy=updated_policy, - ) - ) - new_version = update_resp.version - assert new_version > initial_version, ( - f"Adding the first network rule should create a new version > {initial_version}, " - f"got {new_version}" - ) - - import time - - deadline = time.time() + 90 - loaded = False - while time.time() < deadline: - status_resp = stub.GetSandboxPolicyStatus( - openshell_pb2.GetSandboxPolicyStatusRequest( - name=sandbox_name, version=new_version - ) - ) - status = status_resp.revision.status - if status == openshell_pb2.POLICY_STATUS_LOADED: - loaded = True - break - if status == openshell_pb2.POLICY_STATUS_FAILED: - pytest.fail( - f"Policy v{new_version} failed to load: " - f"{status_resp.revision.load_error}" - ) - time.sleep(2) - - assert loaded, f"Policy v{new_version} was not loaded within 90s" - - allowed = sb.exec_python(_proxy_connect(), args=("example.com", 443)) - assert allowed.exit_code == 0, allowed.stderr - assert "200" in allowed.stdout, allowed.stdout - - # ============================================================================= # Forward proxy tests (plain HTTP, non-CONNECT) # ============================================================================= diff --git a/e2e/rust/tests/live_policy_update.rs b/e2e/rust/tests/live_policy_update.rs new file mode 100644 index 000000000..c60b29548 --- /dev/null +++ b/e2e/rust/tests/live_policy_update.rs @@ -0,0 +1,423 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! E2E tests for live policy updates on a running sandbox. +//! +//! Covers the full round-trip: +//! - Create sandbox with policy A +//! - Verify initial policy version via `policy get` +//! 
- Push same policy A again -> no version bump (idempotent) +//! - Push different policy B -> new version, `--wait` for sandbox to load it +//! - Verify policy history via `policy list` +//! +//! These tests replace the Python e2e tests `test_live_policy_update_and_logs` +//! and `test_live_policy_update_from_empty_network_policies`, which were flaky +//! due to hard-coded 90s poll timeouts. The Rust tests use the CLI's built-in +//! `--wait` flag for reliable synchronization. +//! +//! Note: the removed Python tests also covered `GetSandboxLogs` RPC and +//! verified actual proxy connectivity after policy update. Those are tracked +//! as follow-up coverage gaps -- the proxy enforcement path is covered by the +//! existing L4/L7/SSRF Python e2e tests, and log fetching needs a dedicated +//! test. + +#![cfg(feature = "e2e")] + +use std::fmt::Write as _; +use std::io::Write; +use std::process::Stdio; + +use openshell_e2e::harness::binary::openshell_cmd; +use openshell_e2e::harness::output::{extract_field, strip_ansi}; +use openshell_e2e::harness::sandbox::SandboxGuard; +use tempfile::NamedTempFile; + +// --------------------------------------------------------------------------- +// Policy YAML builders +// --------------------------------------------------------------------------- + +/// Build a policy YAML that allows any binary to reach the given hosts on +/// port 443. +/// +/// NOTE: The indentation in the format string is load-bearing YAML structure. 
+fn write_policy(hosts: &[&str]) -> Result { + let mut file = NamedTempFile::new().map_err(|e| format!("create temp policy file: {e}"))?; + + let mut network_rules = String::new(); + for (i, host) in hosts.iter().enumerate() { + let _ = write!( + network_rules, + r#" rule_{i}: + name: rule_{i} + endpoints: + - host: {host} + port: 443 + binaries: + - path: "/**" +"# + ); + } + + let policy = format!( + r"version: 1 + +filesystem_policy: + include_workdir: true + read_only: + - /usr + - /lib + - /proc + - /dev/urandom + - /app + - /etc + - /var/log + read_write: + - /sandbox + - /tmp + - /dev/null + +landlock: + compatibility: best_effort + +process: + run_as_user: sandbox + run_as_group: sandbox + +network_policies: +{network_rules}" + ); + + file.write_all(policy.as_bytes()) + .map_err(|e| format!("write temp policy file: {e}"))?; + file.flush() + .map_err(|e| format!("flush temp policy file: {e}"))?; + Ok(file) +} + +/// Build a minimal policy YAML with no network rules. +fn write_empty_network_policy() -> Result { + let mut file = NamedTempFile::new().map_err(|e| format!("create temp policy file: {e}"))?; + + let policy = r"version: 1 + +filesystem_policy: + include_workdir: true + read_only: + - /usr + - /lib + - /proc + - /dev/urandom + - /app + - /etc + - /var/log + read_write: + - /sandbox + - /tmp + - /dev/null + +landlock: + compatibility: best_effort + +process: + run_as_user: sandbox + run_as_group: sandbox +"; + + file.write_all(policy.as_bytes()) + .map_err(|e| format!("write temp policy file: {e}"))?; + file.flush() + .map_err(|e| format!("flush temp policy file: {e}"))?; + Ok(file) +} + +// --------------------------------------------------------------------------- +// CLI helpers +// --------------------------------------------------------------------------- + +struct CliResult { + success: bool, + output: String, + exit_code: Option, +} + +/// Run an `openshell` CLI command and return the result. 
+async fn run_cli(args: &[&str]) -> CliResult { + let mut cmd = openshell_cmd(); + cmd.args(args).stdout(Stdio::piped()).stderr(Stdio::piped()); + + let output = cmd.output().await.expect("spawn openshell command"); + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let combined = strip_ansi(&format!("{stdout}{stderr}")); + + CliResult { + success: output.status.success(), + output: combined, + exit_code: output.status.code(), + } +} + +/// Extract the policy version number from `policy get` output. +/// +/// Uses the shared `extract_field` helper to find `Version: ` or +/// `Revision: ` in CLI tabular output. +fn extract_version(output: &str) -> Option<u32> { + extract_field(output, "Version") + .or_else(|| extract_field(output, "Revision")) + .and_then(|v| v.parse::<u32>().ok()) +} + +/// Extract the policy hash from `policy get` output. +fn extract_hash(output: &str) -> Option<String> { + extract_field(output, "Hash") + .or_else(|| extract_field(output, "Policy hash")) +} + +/// Check that a version number appears in `policy list` output as a +/// distinct field value (not just a substring of some other number). +/// +/// Looks for the version number preceded by whitespace or at the start +/// of a line, to avoid matching "2" inside "12" or timestamps. +fn list_output_contains_version(output: &str, version: u32) -> bool { + let v = version.to_string(); + output.lines().any(|line| { + line.split_whitespace() + .any(|word| word == v || word.starts_with(&format!("{v} "))) + }) +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +/// Test the full live policy update lifecycle: +/// +/// 1. Create sandbox with `--keep` +/// 2. Set policy A, verify initial version >= 1 +/// 3. Push same policy A -> version unchanged (idempotent) +/// 4. 
Push policy B (adds example.com) with `--wait` -> new version +/// 5. Push policy B again -> idempotent +/// 6. Verify policy list shows both versions +#[tokio::test] +#[allow(clippy::too_many_lines)] +async fn live_policy_update_round_trip() { + // --- Write two distinct policy files --- + let policy_a = write_policy(&["api.anthropic.com"]).expect("write policy A"); + let policy_b = + write_policy(&["api.anthropic.com", "example.com"]).expect("write policy B"); + + let policy_a_path = policy_a + .path() + .to_str() + .expect("policy A path should be utf-8") + .to_string(); + let policy_b_path = policy_b + .path() + .to_str() + .expect("policy B path should be utf-8") + .to_string(); + + // --- Create a long-running sandbox --- + let mut guard = SandboxGuard::create_keep( + &["sh", "-c", "echo Ready && sleep infinity"], + "Ready", + ) + .await + .expect("create keep sandbox"); + + // --- Set initial policy A --- + let r = run_cli(&[ + "policy", "set", &guard.name, "--policy", &policy_a_path, "--wait", "--timeout", "120", + ]) + .await; + assert!( + r.success, + "policy set A should succeed (exit {:?}):\n{}", + r.exit_code, r.output + ); + + // --- Verify initial policy version --- + let r = run_cli(&["policy", "get", &guard.name]).await; + assert!( + r.success, + "policy get should succeed (exit {:?}):\n{}", + r.exit_code, r.output + ); + + let initial_version = extract_version(&r.output) + .unwrap_or_else(|| panic!("could not parse version from policy get output:\n{}", r.output)); + assert!( + initial_version >= 1, + "initial policy version should be >= 1, got {initial_version}" + ); + + let initial_hash = extract_hash(&r.output); + + // --- Push same policy A again -> should be idempotent --- + let r = run_cli(&[ + "policy", "set", &guard.name, "--policy", &policy_a_path, "--wait", "--timeout", "120", + ]) + .await; + assert!( + r.success, + "policy set A (repeat) should succeed (exit {:?}):\n{}", + r.exit_code, r.output + ); + + let r = run_cli(&["policy", 
"get", &guard.name]).await; + assert!(r.success, "policy get after repeat should succeed:\n{}", r.output); + + let repeat_version = extract_version(&r.output) + .unwrap_or_else(|| panic!("could not parse version after repeat:\n{}", r.output)); + assert_eq!( + repeat_version, initial_version, + "same policy should not bump version: expected {initial_version}, got {repeat_version}" + ); + + if let (Some(ih), Some(rh)) = (&initial_hash, &extract_hash(&r.output)) { + assert_eq!(ih, rh, "same policy should produce same hash"); + } + + // --- Push policy B -> should create new version --- + let r = run_cli(&[ + "policy", "set", &guard.name, "--policy", &policy_b_path, "--wait", "--timeout", "120", + ]) + .await; + assert!( + r.success, + "policy set B should succeed (exit {:?}):\n{}", + r.exit_code, r.output + ); + + let r = run_cli(&["policy", "get", &guard.name]).await; + assert!(r.success, "policy get after B should succeed:\n{}", r.output); + + let new_version = extract_version(&r.output) + .unwrap_or_else(|| panic!("could not parse version after B:\n{}", r.output)); + assert!( + new_version > initial_version, + "different policy should bump version: expected > {initial_version}, got {new_version}" + ); + + if let (Some(ih), Some(nh)) = (&initial_hash, &extract_hash(&r.output)) { + assert_ne!(ih, nh, "different policy should produce different hash"); + } + + // --- Push policy B again -> idempotent --- + let r = run_cli(&[ + "policy", "set", &guard.name, "--policy", &policy_b_path, "--wait", "--timeout", "120", + ]) + .await; + assert!( + r.success, + "policy set B (repeat) should succeed (exit {:?}):\n{}", + r.exit_code, r.output + ); + + let r = run_cli(&["policy", "get", &guard.name]).await; + assert!(r.success, "policy get after B repeat should succeed:\n{}", r.output); + + let repeat_b_version = extract_version(&r.output) + .unwrap_or_else(|| panic!("could not parse version after B repeat:\n{}", r.output)); + assert_eq!( + repeat_b_version, new_version, + "same 
policy B should not bump version: expected {new_version}, got {repeat_b_version}" + ); + + // --- Verify policy list shows revision history --- + let r = run_cli(&["policy", "list", &guard.name]).await; + assert!( + r.success, + "policy list should succeed (exit {:?}):\n{}", + r.exit_code, r.output + ); + + // Both versions should appear in the list output. + assert!( + list_output_contains_version(&r.output, new_version), + "policy list should contain version {new_version}:\n{}", + r.output + ); + assert!( + list_output_contains_version(&r.output, initial_version), + "policy list should contain initial version {initial_version}:\n{}", + r.output + ); + + guard.cleanup().await; +} + +/// Test live policy update from an initially empty network policy: +/// +/// 1. Create sandbox with `--keep` +/// 2. Set policy with no network rules +/// 3. Push policy with a network rule using `--wait` +/// 4. Verify the version bumped +#[tokio::test] +async fn live_policy_update_from_empty_network_policies() { + let empty_policy = write_empty_network_policy().expect("write empty network policy"); + let full_policy = write_policy(&["example.com"]).expect("write full policy"); + + let empty_path = empty_policy + .path() + .to_str() + .expect("empty policy path should be utf-8") + .to_string(); + let full_path = full_policy + .path() + .to_str() + .expect("full policy path should be utf-8") + .to_string(); + + // Create sandbox with empty network policy. + let mut guard = SandboxGuard::create_keep( + &["sh", "-c", "echo Ready && sleep infinity"], + "Ready", + ) + .await + .expect("create keep sandbox"); + + // Set initial empty policy. 
+ let r = run_cli(&[ + "policy", "set", &guard.name, "--policy", &empty_path, "--wait", "--timeout", "120", + ]) + .await; + assert!( + r.success, + "policy set (empty) should succeed (exit {:?}):\n{}", + r.exit_code, r.output + ); + + let r = run_cli(&["policy", "get", &guard.name]).await; + assert!(r.success, "policy get (empty) should succeed:\n{}", r.output); + + let initial_version = extract_version(&r.output) + .unwrap_or_else(|| panic!("could not parse version from empty policy:\n{}", r.output)); + + // Push policy with network rules. + let r = run_cli(&[ + "policy", "set", &guard.name, "--policy", &full_path, "--wait", "--timeout", "120", + ]) + .await; + assert!( + r.success, + "policy set (full) should succeed (exit {:?}):\n{}", + r.exit_code, r.output + ); + + let r = run_cli(&["policy", "get", &guard.name]).await; + assert!(r.success, "policy get (full) should succeed:\n{}", r.output); + + let new_version = extract_version(&r.output).unwrap_or_else(|| { + panic!( + "could not parse version after adding network rules:\n{}", + r.output + ) + }); + assert!( + new_version > initial_version, + "adding network rules should create new version > {initial_version}, got {new_version}" + ); + + guard.cleanup().await; +} From eea495e6b9002dc611cf73daa893fb13a1a24dce Mon Sep 17 00:00:00 2001 From: "John T. Myers" <9696606+johntmyers@users.noreply.github.com> Date: Thu, 2 Apr 2026 20:32:59 -0700 Subject: [PATCH 41/45] fix: remediate 9 security findings from external audit (OS-15 through OS-23) (#744) * fix(install): restrict tar extraction to expected binary member Prevents CWE-22 path traversal by extracting only the expected APP_NAME member instead of the full archive contents. Adds --no-same-owner and --no-same-permissions for defense-in-depth. OS-20 * fix(deploy): quote registry credentials in YAML heredocs Wraps username/password values with a yaml_quote helper to prevent YAML injection from special characters in registry credentials (CWE-94). 
Applied to all three heredoc blocks that emit registries.yaml auth. OS-23 * fix(server): redact session token in SSH tunnel rate-limit log Logs only the last 4 characters of bearer tokens to prevent credential exposure in log aggregation systems (CWE-532). OS-18 * fix(server): escape gateway_display in auth connect page Applies html_escape() to the Host/X-Forwarded-Host header value before rendering it into the HTML template, preventing HTML injection (CWE-79). OS-17 * fix(server): prevent XSS via code param with validation and proper JS escaping Adds server-side validation rejecting confirmation codes that do not match the CLI-generated format, replaces manual JS string escaping with serde_json serialization (handling U+2028/U+2029 line terminators), and adds a Content-Security-Policy header with nonce-based script-src. OS-16 * fix(sandbox): add byte cap and idle timeout to streaming inference relay Prevents resource exhaustion from upstream inference endpoints that stream indefinitely or hold connections open. Adds a 32 MiB total body limit and 30-second per-chunk idle timeout (CWE-400). OS-21 * fix(policy): narrow port field from u32 to u16 to reject invalid values Prevents meaningless port values >65535 from being accepted in policy YAML definitions. The proto field remains uint32 (protobuf has no u16) with validation at the conversion boundary. OS-22 * fix(deps): migrate from archived serde_yaml to serde_yml Replaces serde_yaml 0.9 (archived, RUSTSEC-2024-0320) with serde_yml 0.0.12, a maintained API-compatible fork. All import sites updated across openshell-policy, openshell-sandbox, and openshell-router. OS-19 * fix(server): re-validate sandbox-submitted security_notes and cap hit_count The gateway now re-runs security heuristics on proposed policy chunks instead of trusting sandbox-provided security_notes, validates host wildcards, caps hit_count at 100, and clamps confidence to [0,1]. 
The TUI approve-all path is updated to use ApproveAllDraftChunks RPC which respects the security_notes filtering gate (CWE-284, confused deputy). OS-15 * chore: apply cargo fmt and update Cargo.lock for serde_yml --------- Co-authored-by: John Myers --- Cargo.lock | 45 +++++-- Cargo.toml | 2 +- crates/openshell-policy/Cargo.toml | 2 +- crates/openshell-policy/src/lib.rs | 39 ++++-- crates/openshell-router/Cargo.toml | 2 +- crates/openshell-router/src/config.rs | 2 +- crates/openshell-sandbox/Cargo.toml | 2 +- crates/openshell-sandbox/src/opa.rs | 2 +- crates/openshell-sandbox/src/proxy.rs | 33 ++++- crates/openshell-server/src/auth.rs | 139 +++++++++++++++++----- crates/openshell-server/src/grpc.rs | 64 ++++++++-- crates/openshell-server/src/ssh_tunnel.rs | 11 +- crates/openshell-tui/src/lib.rs | 75 ++++++------ deploy/docker/cluster-entrypoint.sh | 21 +++- install.sh | 2 +- 15 files changed, 326 insertions(+), 115 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7d20c9bd8..852d97a0c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -125,7 +125,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -136,7 +136,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -1283,7 +1283,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -2497,6 +2497,16 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "libyml" +version = "0.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3302702afa434ffa30847a83305f0a69d6abd74293b6554c18ec85c7ef30c980" +dependencies = [ + "anyhow", + "version_check", +] + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -2691,7 +2701,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2902,7 +2912,7 @@ dependencies = [ "miette", "openshell-core", "serde", - "serde_yaml", + "serde_yml", ] [[package]] @@ -2922,7 +2932,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "serde_yaml", + "serde_yml", "tempfile", "thiserror 2.0.18", "tokio", @@ -2958,7 +2968,7 @@ dependencies = [ "rustls-pemfile", "seccompiler", "serde_json", - "serde_yaml", + "serde_yml", "sha2 0.10.9", "temp-env", "tempfile", @@ -4044,7 +4054,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.12.1", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -4348,6 +4358,21 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "serde_yml" +version = "0.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59e2dd588bf1597a252c3b920e0143eb99b0f76e4e082f4c92ce34fbc9e71ddd" +dependencies = [ + "indexmap 2.13.0", + "itoa", + "libyml", + "memchr", + "ryu", + "serde", + "version_check", +] + [[package]] name = "serdect" version = "0.4.2" @@ -4519,7 +4544,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -4930,7 +4955,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix 1.1.4", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 4fecf1940..08b699d47 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,7 +64,7 @@ nix = { version = "0.29", features = ["signal", 
"process", "user", "fs", "term"] # Serialization serde = { version = "1", features = ["derive"] } serde_json = "1" -serde_yaml = "0.9" +serde_yml = "0.0.12" # HTTP client reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } diff --git a/crates/openshell-policy/Cargo.toml b/crates/openshell-policy/Cargo.toml index 311bb4e86..f26136c6b 100644 --- a/crates/openshell-policy/Cargo.toml +++ b/crates/openshell-policy/Cargo.toml @@ -13,7 +13,7 @@ repository.workspace = true [dependencies] openshell-core = { path = "../openshell-core" } serde = { workspace = true } -serde_yaml = { workspace = true } +serde_yml = { workspace = true } miette = { workspace = true } [lints] diff --git a/crates/openshell-policy/src/lib.rs b/crates/openshell-policy/src/lib.rs index 7adb4dfda..9cf543bdf 100644 --- a/crates/openshell-policy/src/lib.rs +++ b/crates/openshell-policy/src/lib.rs @@ -82,11 +82,12 @@ struct NetworkEndpointDef { #[serde(default, skip_serializing_if = "String::is_empty")] host: String, /// Single port (backwards compat). Mutually exclusive with `ports`. + /// Uses `u16` to reject invalid values >65535 at parse time. #[serde(default, skip_serializing_if = "is_zero")] - port: u32, + port: u16, /// Multiple ports. When non-empty, this endpoint covers all listed ports. #[serde(default, skip_serializing_if = "Vec::is_empty")] - ports: Vec, + ports: Vec, #[serde(default, skip_serializing_if = "String::is_empty")] protocol: String, #[serde(default, skip_serializing_if = "String::is_empty")] @@ -101,7 +102,7 @@ struct NetworkEndpointDef { allowed_ips: Vec, } -fn is_zero(v: &u32) -> bool { +fn is_zero(v: &u16) -> bool { *v == 0 } @@ -169,10 +170,10 @@ fn to_proto(raw: PolicyFile) -> SandboxPolicy { .map(|e| { // Normalize port/ports: ports takes precedence, else // single port is promoted to ports array. 
- let normalized_ports = if !e.ports.is_empty() { - e.ports + let normalized_ports: Vec = if !e.ports.is_empty() { + e.ports.into_iter().map(u32::from).collect() } else if e.port > 0 { - vec![e.port] + vec![u32::from(e.port)] } else { vec![] }; @@ -285,10 +286,12 @@ fn from_proto(policy: &SandboxPolicy) -> PolicyFile { .map(|e| { // Use compact form: if ports has exactly 1 element, // emit port (scalar). If >1, emit ports (array). + // Proto uses u32; YAML uses u16. Clamp at boundary. + let clamp = |v: u32| -> u16 { v.min(65535) as u16 }; let (port, ports) = if e.ports.len() > 1 { - (0, e.ports.clone()) + (0, e.ports.iter().map(|&p| clamp(p)).collect()) } else { - (e.ports.first().copied().unwrap_or(e.port), vec![]) + (clamp(e.ports.first().copied().unwrap_or(e.port)), vec![]) }; NetworkEndpointDef { host: e.host.clone(), @@ -358,7 +361,7 @@ fn from_proto(policy: &SandboxPolicy) -> PolicyFile { /// Parse a sandbox policy from a YAML string. pub fn parse_sandbox_policy(yaml: &str) -> Result { - let raw: PolicyFile = serde_yaml::from_str(yaml) + let raw: PolicyFile = serde_yml::from_str(yaml) .into_diagnostic() .wrap_err("failed to parse sandbox policy YAML")?; Ok(to_proto(raw)) @@ -371,7 +374,7 @@ pub fn parse_sandbox_policy(yaml: &str) -> Result { /// and is round-trippable through `parse_sandbox_policy`. 
pub fn serialize_sandbox_policy(policy: &SandboxPolicy) -> Result { let yaml_repr = from_proto(policy); - serde_yaml::to_string(&yaml_repr) + serde_yml::to_string(&yaml_repr) .into_diagnostic() .wrap_err("failed to serialize policy to YAML") } @@ -1207,4 +1210,20 @@ network_policies: proto2.network_policies["test"].endpoints[0].host ); } + + #[test] + fn rejects_port_above_65535() { + let yaml = r#" +version: 1 +network_policies: + test: + endpoints: + - host: example.com + port: 70000 +"#; + assert!( + parse_sandbox_policy(yaml).is_err(), + "port >65535 should fail to parse" + ); + } } diff --git a/crates/openshell-router/Cargo.toml b/crates/openshell-router/Cargo.toml index dc8e9c924..e4c3d5ea7 100644 --- a/crates/openshell-router/Cargo.toml +++ b/crates/openshell-router/Cargo.toml @@ -19,7 +19,7 @@ serde_json = { workspace = true } thiserror = { workspace = true } tracing = { workspace = true } tokio = { workspace = true } -serde_yaml = { workspace = true } +serde_yml = { workspace = true } uuid = { workspace = true } [dev-dependencies] diff --git a/crates/openshell-router/src/config.rs b/crates/openshell-router/src/config.rs index 52c22da9f..b531e091d 100644 --- a/crates/openshell-router/src/config.rs +++ b/crates/openshell-router/src/config.rs @@ -75,7 +75,7 @@ impl RouterConfig { path.display() )) })?; - let config: Self = serde_yaml::from_str(&content).map_err(|e| { + let config: Self = serde_yml::from_str(&content).map_err(|e| { RouterError::Internal(format!( "failed to parse router config {}: {e}", path.display() diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index 68e696e95..e8e7e2c97 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -60,7 +60,7 @@ ipnet = "2" # Serialization serde_json = { workspace = true } -serde_yaml = { workspace = true } +serde_yml = { workspace = true } # Logging tracing = { workspace = true } diff --git a/crates/openshell-sandbox/src/opa.rs 
b/crates/openshell-sandbox/src/opa.rs index f1df12ff4..f1c0ad293 100644 --- a/crates/openshell-sandbox/src/opa.rs +++ b/crates/openshell-sandbox/src/opa.rs @@ -511,7 +511,7 @@ fn parse_process_policy(val: ®orus::Value) -> ProcessPolicy { /// Preprocess YAML policy data: parse, normalize, validate, expand access presets, return JSON. fn preprocess_yaml_data(yaml_str: &str) -> Result { - let mut data: serde_json::Value = serde_yaml::from_str(yaml_str) + let mut data: serde_json::Value = serde_yml::from_str(yaml_str) .map_err(|e| miette::miette!("failed to parse YAML data: {e}"))?; // Normalize port → ports for all endpoints so Rego always sees "ports" array. diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index a7df76e2f..9e87450d4 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -23,6 +23,12 @@ use tracing::{debug, info, warn}; const MAX_HEADER_BYTES: usize = 8192; const INFERENCE_LOCAL_HOST: &str = "inference.local"; +/// Maximum total bytes for a streaming inference response body (32 MiB). +const MAX_STREAMING_BODY: usize = 32 * 1024 * 1024; + +/// Idle timeout per chunk when relaying streaming inference responses. +const CHUNK_IDLE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30); + /// Result of a proxy CONNECT policy decision. struct ConnectDecision { action: NetworkAction, @@ -1045,18 +1051,35 @@ async fn route_inference_request( let header_bytes = format_http_response_header(resp.status, &resp_headers); write_all(tls_client, &header_bytes).await?; - // Stream body chunks as they arrive from the upstream. + // Stream body chunks with byte cap and idle timeout. 
+ let mut total_bytes: usize = 0; loop { - match resp.next_chunk().await { - Ok(Some(chunk)) => { + match tokio::time::timeout(CHUNK_IDLE_TIMEOUT, resp.next_chunk()).await { + Ok(Ok(Some(chunk))) => { + total_bytes += chunk.len(); + if total_bytes > MAX_STREAMING_BODY { + warn!( + total_bytes = total_bytes, + limit = MAX_STREAMING_BODY, + "streaming response exceeded byte limit, truncating" + ); + break; + } let encoded = format_chunk(&chunk); write_all(tls_client, &encoded).await?; } - Ok(None) => break, - Err(e) => { + Ok(Ok(None)) => break, + Ok(Err(e)) => { warn!(error = %e, "error reading upstream response chunk"); break; } + Err(_) => { + warn!( + idle_timeout_secs = CHUNK_IDLE_TIMEOUT.as_secs(), + "streaming response chunk idle timeout, closing" + ); + break; + } } } diff --git a/crates/openshell-server/src/auth.rs b/crates/openshell-server/src/auth.rs index 5a3229ffa..b896d062c 100644 --- a/crates/openshell-server/src/auth.rs +++ b/crates/openshell-server/src/auth.rs @@ -22,11 +22,28 @@ use axum::{ response::{Html, IntoResponse}, routing::get, }; +use http::header; use serde::Deserialize; use std::sync::Arc; use crate::ServerState; +/// Validate that a confirmation code matches the CLI-generated format. +/// +/// Codes are 3 alphanumeric characters, a dash, then 4 alphanumeric characters +/// (e.g., "AB7-X9KM"). The CLI generates these from the charset `[A-Z2-9]`. +fn is_valid_code(code: &str) -> bool { + let bytes = code.as_bytes(); + bytes.len() == 8 + && bytes[3] == b'-' + && bytes[..3] + .iter() + .all(|b| b.is_ascii_uppercase() || b.is_ascii_digit()) + && bytes[4..] + .iter() + .all(|b| b.is_ascii_uppercase() || b.is_ascii_digit()) +} + #[derive(Deserialize)] struct ConnectParams { callback_port: u16, @@ -54,6 +71,15 @@ async fn auth_connect( Query(params): Query, headers: HeaderMap, ) -> impl IntoResponse { + // Reject codes that don't match the CLI-generated format to prevent + // reflected XSS via crafted URLs. 
+ if !is_valid_code(&params.code) { + return Html( + "

Invalid confirmation code format.

".to_string(), + ) + .into_response(); + } + let cf_token = headers .get("cookie") .and_then(|v| v.to_str().ok()) @@ -68,14 +94,34 @@ async fn auth_connect( .and_then(|v| v.to_str().ok()) .map_or_else(|| state.config.bind_address.to_string(), String::from); + let safe_gateway = html_escape(&gateway_display); + match cf_token { - Some(token) => Html(render_connect_page( - &gateway_display, - params.callback_port, - &token, - ¶ms.code, - )), - None => Html(render_waiting_page(params.callback_port, ¶ms.code)), + Some(token) => { + let nonce = uuid::Uuid::new_v4().to_string(); + let csp = format!( + "default-src 'none'; script-src 'nonce-{nonce}'; style-src 'unsafe-inline'; connect-src http://127.0.0.1:*" + ); + ( + [(header::CONTENT_SECURITY_POLICY, csp)], + Html(render_connect_page( + &safe_gateway, + params.callback_port, + &token, + ¶ms.code, + &nonce, + )), + ) + .into_response() + } + None => { + let csp = "default-src 'none'; style-src 'unsafe-inline'".to_string(); + ( + [(header::CONTENT_SECURITY_POLICY, csp)], + Html(render_waiting_page(params.callback_port, ¶ms.code)), + ) + .into_response() + } } } @@ -104,22 +150,27 @@ fn render_connect_page( callback_port: u16, cf_token: &str, code: &str, + nonce: &str, ) -> String { - // Escape the token for safe embedding in a JS string literal. - let escaped_token = cf_token - .replace('\\', "\\\\") - .replace('\'', "\\'") - .replace('"', "\\\"") - .replace('<', "\\x3c") - .replace('>', "\\x3e"); + // Use JSON serialization for JS-safe string embedding — handles all + // edge cases including \n, \r, U+2028, U+2029 that break JS string + // literals. serde_json::to_string produces a quoted JSON string + // (e.g., "value") which is a valid JS string literal. + // + // We additionally escape < and > to \u003c / \u003e because while + // they're valid in JSON, they're dangerous inside an HTML before the JS parser runs). 
+ let json_token = serde_json::to_string(cf_token) + .unwrap_or_else(|_| "\"\"".to_string()) + .replace('<', "\\u003c") + .replace('>', "\\u003e"); + let json_code = serde_json::to_string(code) + .unwrap_or_else(|_| "\"\"".to_string()) + .replace('<', "\\u003c") + .replace('>', "\\u003e"); - // Escape the code the same way (it's alphanumeric + dash, but be safe). - let escaped_code = code - .replace('\\', "\\\\") - .replace('\'', "\\'") - .replace('"', "\\\"") - .replace('<', "\\x3c") - .replace('>', "\\x3e"); + // HTML-safe version of the code for display in the page body. + let html_code = html_escape(code); let version = openshell_core::VERSION; @@ -250,7 +301,7 @@ fn render_connect_page(
Connect to Gateway
Confirmation Code
-
{escaped_code}
+
{html_code}
Verify this matches the code shown in your terminal
@@ -271,9 +322,9 @@ fn render_connect_page(
- ", "ABC-1234"); - // < and > should be escaped + let html = render_connect_page( + "gw", + 1234, + "token", + "ABC-1234", + "nonce", + ); + // < and > should be escaped via JSON encoding (\u003c) assert!(!html.contains("