diff --git a/architecture/gateway-security.md b/architecture/gateway-security.md index bb54ed413..e708ab00f 100644 --- a/architecture/gateway-security.md +++ b/architecture/gateway-security.md @@ -229,7 +229,31 @@ SSH connections into sandboxes pass through the gateway's HTTP CONNECT tunnel at | `x-sandbox-id` | Identifies the target sandbox | | `x-sandbox-token` | Session token (created via `CreateSshSession` RPC) | -The gateway validates the token against the stored `SshSession` record, checks that it has not been revoked, and confirms the `sandbox_id` matches. +The gateway validates the token against the stored `SshSession` record and checks: + +1. The token has not been revoked. +2. The `sandbox_id` matches the request header. +3. The token has not expired (`expires_at_ms` check; 0 means no expiry for backward compatibility). + +### Session Lifecycle + +SSH session tokens have a configurable TTL (`ssh_session_ttl_secs`, default 24 hours). The `expires_at_ms` field is set at creation time and checked on every tunnel request. Setting the TTL to 0 disables expiry. + +Sessions are cleaned up automatically: + +- **On sandbox deletion**: all SSH sessions for the deleted sandbox are removed from the store. +- **Background reaper**: a periodic task (hourly) deletes expired and revoked session records to prevent unbounded database growth. + +### Connection Limits + +The gateway enforces two concurrent connection limits to bound the impact of credential misuse: + +| Limit | Value | Purpose | +|---|---|---| +| Per-token | 3 concurrent tunnels | Limits damage from a single leaked token | +| Per-sandbox | 20 concurrent tunnels | Prevents bypass via creating many tokens for one sandbox | + +These limits are tracked in-memory and decremented when tunnels close. Exceeding either limit returns HTTP 429 (Too Many Requests). 
### NSSH1 Handshake @@ -362,6 +386,7 @@ This section defines the primary attacker profiles, what the current design prot | Weak cryptoperiod | Certificates are effectively non-expiring by default | | Limited fine-grained revocation | CA private key is not persisted; rotation is coarse-grained | | Local credential theft risk | CLI mTLS key material is stored on developer filesystem | +| SSH token + mTLS = persistent access within trust boundary | SSH tokens expire after 24h (configurable) and are capped at 3 concurrent connections per token / 20 per sandbox, but within the mTLS trust boundary a stolen token remains usable until TTL expires | ### Out of Scope / Not Defended By This Layer diff --git a/crates/navigator-core/src/config.rs b/crates/navigator-core/src/config.rs index a71e72959..4fd136163 100644 --- a/crates/navigator-core/src/config.rs +++ b/crates/navigator-core/src/config.rs @@ -61,6 +61,10 @@ pub struct Config { #[serde(default = "default_ssh_handshake_skew_secs")] pub ssh_handshake_skew_secs: u64, + /// TTL for SSH session tokens, in seconds. 0 disables expiry. + #[serde(default = "default_ssh_session_ttl_secs")] + pub ssh_session_ttl_secs: u64, + /// Kubernetes secret name containing client TLS materials for sandbox pods. /// When set, sandbox pods get this secret mounted so they can connect to /// the server over mTLS. @@ -103,6 +107,7 @@ impl Config { sandbox_ssh_port: default_sandbox_ssh_port(), ssh_handshake_secret: String::new(), ssh_handshake_skew_secs: default_ssh_handshake_skew_secs(), + ssh_session_ttl_secs: default_ssh_session_ttl_secs(), client_tls_secret_name: String::new(), } } @@ -191,6 +196,13 @@ impl Config { self } + /// Create a new configuration with the SSH session TTL. + #[must_use] + pub const fn with_ssh_session_ttl_secs(mut self, secs: u64) -> Self { + self.ssh_session_ttl_secs = secs; + self + } + /// Set the Kubernetes secret name for sandbox client TLS materials. 
#[must_use] pub fn with_client_tls_secret_name(mut self, name: impl Into) -> Self { @@ -230,3 +242,7 @@ const fn default_sandbox_ssh_port() -> u16 { const fn default_ssh_handshake_skew_secs() -> u64 { 300 } + +const fn default_ssh_session_ttl_secs() -> u64 { + 86400 // 24 hours +} diff --git a/crates/navigator-server/src/grpc.rs b/crates/navigator-server/src/grpc.rs index 0a733c47c..97275a90b 100644 --- a/crates/navigator-server/src/grpc.rs +++ b/crates/navigator-server/src/grpc.rs @@ -544,6 +544,33 @@ impl Navigator for NavigatorService { self.state.sandbox_index.update_from_sandbox(&sandbox); self.state.sandbox_watch_bus.notify(&id); + // Clean up SSH sessions associated with this sandbox. + if let Ok(records) = self + .state + .store + .list(SshSession::object_type(), 1000, 0) + .await + { + for record in records { + if let Ok(session) = SshSession::decode(record.payload.as_slice()) { + if session.sandbox_id == id { + if let Err(e) = self + .state + .store + .delete(SshSession::object_type(), &session.id) + .await + { + warn!( + session_id = %session.id, + error = %e, + "Failed to delete SSH session during sandbox cleanup" + ); + } + } + } + } + } + let deleted = match self.state.sandbox_client.delete(&sandbox.name).await { Ok(deleted) => deleted, Err(err) => { @@ -787,14 +814,21 @@ impl Navigator for NavigatorService { } let token = uuid::Uuid::new_v4().to_string(); + let now_ms = current_time_ms() + .map_err(|e| Status::internal(format!("timestamp generation failed: {e}")))?; + let expires_at_ms = if self.state.config.ssh_session_ttl_secs > 0 { + now_ms + (self.state.config.ssh_session_ttl_secs as i64 * 1000) + } else { + 0 + }; let session = SshSession { id: token.clone(), sandbox_id: req.sandbox_id.clone(), token: token.clone(), - created_at_ms: current_time_ms() - .map_err(|e| Status::internal(format!("timestamp generation failed: {e}")))?, + created_at_ms: now_ms, revoked: false, name: generate_name(), + expires_at_ms, }; self.state @@ -814,6 +848,7 @@ 
impl Navigator for NavigatorService { gateway_scheme: scheme.to_string(), connect_path: self.state.config.ssh_connect_path.clone(), host_key_fingerprint: String::new(), + expires_at_ms, })) } diff --git a/crates/navigator-server/src/lib.rs b/crates/navigator-server/src/lib.rs index 7c7a4683b..f6e49c6dd 100644 --- a/crates/navigator-server/src/lib.rs +++ b/crates/navigator-server/src/lib.rs @@ -22,7 +22,8 @@ mod tls; pub mod tracing_bus; use navigator_core::{Config, Error, Result}; -use std::sync::Arc; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; use tokio::net::TcpListener; use tracing::{error, info}; @@ -56,6 +57,12 @@ pub struct ServerState { /// In-memory bus for server process logs. pub tracing_log_bus: TracingLogBus, + + /// Active SSH tunnel connection counts per session token. + pub ssh_connections_by_token: Mutex>, + + /// Active SSH tunnel connection counts per sandbox id. + pub ssh_connections_by_sandbox: Mutex>, } impl ServerState { @@ -76,6 +83,8 @@ impl ServerState { sandbox_index, sandbox_watch_bus, tracing_log_bus, + ssh_connections_by_token: Mutex::new(HashMap::new()), + ssh_connections_by_sandbox: Mutex::new(HashMap::new()), } } } @@ -138,6 +147,7 @@ pub async fn run_server(config: Config, tracing_log_bus: TracingLogBus) -> Resul state.tracing_log_bus.clone(), ); spawn_kube_event_tailer(state.clone()); + ssh_tunnel::spawn_session_reaper(store.clone(), std::time::Duration::from_secs(3600)); // Create the multiplexed service let service = MultiplexService::new(state.clone()); diff --git a/crates/navigator-server/src/ssh_tunnel.rs b/crates/navigator-server/src/ssh_tunnel.rs index 24eae0221..7fea35739 100644 --- a/crates/navigator-server/src/ssh_tunnel.rs +++ b/crates/navigator-server/src/ssh_tunnel.rs @@ -9,6 +9,7 @@ use hyper::Request; use hyper::upgrade::Upgraded; use hyper_util::rt::TokioIo; use navigator_core::proto::{Sandbox, SandboxPhase, SshSession}; +use prost::Message; use std::net::SocketAddr; use std::sync::Arc; use 
std::time::Duration; @@ -18,12 +19,18 @@ use tracing::{info, warn}; use uuid::Uuid; use crate::ServerState; -use crate::persistence::{ObjectId, ObjectName, ObjectType}; +use crate::persistence::{ObjectId, ObjectName, ObjectType, Store}; const HEADER_SANDBOX_ID: &str = "x-sandbox-id"; const HEADER_TOKEN: &str = "x-sandbox-token"; const PREFACE_MAGIC: &str = "NSSH1"; +/// Maximum concurrent SSH tunnel connections per session token. +const MAX_CONNECTIONS_PER_TOKEN: u32 = 3; + +/// Maximum concurrent SSH tunnel connections per sandbox. +const MAX_CONNECTIONS_PER_SANDBOX: u32 = 20; + pub fn router(state: Arc) -> Router { Router::new() .route("/connect/ssh", any(ssh_connect)) @@ -60,6 +67,17 @@ async fn ssh_connect( return StatusCode::UNAUTHORIZED.into_response(); } + // Check token expiry (0 means no expiry for backward compatibility). + if session.expires_at_ms > 0 { + let now_ms = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as i64; + if now_ms > session.expires_at_ms { + return StatusCode::UNAUTHORIZED.into_response(); + } + } + let sandbox = match state.store.get_message::(&sandbox_id).await { Ok(Some(sandbox)) => sandbox, Ok(None) => return StatusCode::NOT_FOUND.into_response(), @@ -93,8 +111,40 @@ async fn ssh_connect( } else { return StatusCode::PRECONDITION_FAILED.into_response(); }; + // Enforce per-token concurrent connection limit. + { + let mut counts = state.ssh_connections_by_token.lock().unwrap(); + let count = counts.entry(token.clone()).or_insert(0); + if *count >= MAX_CONNECTIONS_PER_TOKEN { + warn!(token = %token, "SSH tunnel: per-token connection limit reached"); + return StatusCode::TOO_MANY_REQUESTS.into_response(); + } + *count += 1; + } + + // Enforce per-sandbox concurrent connection limit. 
+ { + let mut counts = state.ssh_connections_by_sandbox.lock().unwrap(); + let count = counts.entry(sandbox_id.clone()).or_insert(0); + if *count >= MAX_CONNECTIONS_PER_SANDBOX { + // Roll back the per-token increment. + let mut token_counts = state.ssh_connections_by_token.lock().unwrap(); + if let Some(c) = token_counts.get_mut(&token) { + *c = c.saturating_sub(1); + if *c == 0 { + token_counts.remove(&token); + } + } + warn!(sandbox_id = %sandbox_id, "SSH tunnel: per-sandbox connection limit reached"); + return StatusCode::TOO_MANY_REQUESTS.into_response(); + } + *count += 1; + } + let handshake_secret = state.config.ssh_handshake_secret.clone(); let sandbox_id_clone = sandbox_id.clone(); + let token_clone = token.clone(); + let state_clone = state.clone(); let upgrade = hyper::upgrade::on(req); tokio::spawn(async move { @@ -103,7 +153,7 @@ async fn ssh_connect( if let Err(err) = handle_tunnel( &mut upgraded, connect_target, - &token, + &token_clone, &handshake_secret, &sandbox_id_clone, ) @@ -116,6 +166,10 @@ async fn ssh_connect( warn!(error = %err, "SSH upgrade failed"); } } + + // Decrement connection counts on tunnel completion. + decrement_connection_count(&state_clone.ssh_connections_by_token, &token_clone); + decrement_connection_count(&state_clone.ssh_connections_by_sandbox, &sandbox_id_clone); }); StatusCode::OK.into_response() @@ -293,3 +347,259 @@ enum ConnectTarget { Ip(SocketAddr), Host(String, u16), } + +/// Decrement a connection count entry, removing it if it reaches zero. +fn decrement_connection_count( + counts: &std::sync::Mutex>, + key: &str, +) { + let mut map = counts.lock().unwrap(); + if let Some(count) = map.get_mut(key) { + *count = count.saturating_sub(1); + if *count == 0 { + map.remove(key); + } + } +} + +/// Spawn a background task that periodically reaps expired and revoked SSH sessions. +pub fn spawn_session_reaper(store: Arc, interval: Duration) { + tokio::spawn(async move { + // Initial delay to let startup settle. 
+ tokio::time::sleep(interval).await; + + loop { + if let Err(e) = reap_expired_sessions(&store).await { + warn!(error = %e, "SSH session reaper sweep failed"); + } + tokio::time::sleep(interval).await; + } + }); +} + +async fn reap_expired_sessions(store: &Store) -> Result<(), String> { + let now_ms = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as i64; + + let records = store + .list(SshSession::object_type(), 1000, 0) + .await + .map_err(|e| e.to_string())?; + + let mut reaped = 0u32; + for record in records { + let session: SshSession = match Message::decode(record.payload.as_slice()) { + Ok(s) => s, + Err(_) => continue, + }; + + let should_delete = + // Expired sessions (expires_at_ms > 0 means expiry is set). + (session.expires_at_ms > 0 && now_ms > session.expires_at_ms) + // Revoked sessions — already invalidated, just cleaning up storage. + || session.revoked; + + if should_delete { + if let Err(e) = store.delete(SshSession::object_type(), &session.id).await { + warn!(session_id = %session.id, error = %e, "Failed to reap SSH session"); + } else { + reaped += 1; + } + } + } + + if reaped > 0 { + info!(count = reaped, "SSH session reaper: cleaned up sessions"); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::persistence::Store; + use std::collections::HashMap; + use std::sync::Mutex; + + fn make_session(id: &str, sandbox_id: &str, expires_at_ms: i64, revoked: bool) -> SshSession { + SshSession { + id: id.to_string(), + sandbox_id: sandbox_id.to_string(), + token: id.to_string(), + created_at_ms: 1000, + revoked, + name: format!("session-{id}"), + expires_at_ms, + } + } + + fn now_ms() -> i64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() as i64 + } + + // ---- Connection limit tests ---- + + #[test] + fn decrement_removes_entry_at_zero() { + let counts: Mutex> = Mutex::new(HashMap::new()); + 
counts.lock().unwrap().insert("tok1".to_string(), 1); + decrement_connection_count(&counts, "tok1"); + assert!(counts.lock().unwrap().is_empty()); + } + + #[test] + fn decrement_reduces_count() { + let counts: Mutex> = Mutex::new(HashMap::new()); + counts.lock().unwrap().insert("tok1".to_string(), 5); + decrement_connection_count(&counts, "tok1"); + assert_eq!(*counts.lock().unwrap().get("tok1").unwrap(), 4); + } + + #[test] + fn decrement_missing_key_is_noop() { + let counts: Mutex> = Mutex::new(HashMap::new()); + decrement_connection_count(&counts, "nonexistent"); + assert!(counts.lock().unwrap().is_empty()); + } + + #[test] + fn per_token_connection_limit_enforced() { + let counts: Mutex> = Mutex::new(HashMap::new()); + counts + .lock() + .unwrap() + .insert("tok1".to_string(), MAX_CONNECTIONS_PER_TOKEN); + let current = *counts.lock().unwrap().get("tok1").unwrap(); + assert!(current >= MAX_CONNECTIONS_PER_TOKEN); + } + + #[test] + fn per_sandbox_connection_limit_enforced() { + let counts: Mutex> = Mutex::new(HashMap::new()); + counts + .lock() + .unwrap() + .insert("sbx1".to_string(), MAX_CONNECTIONS_PER_SANDBOX); + let current = *counts.lock().unwrap().get("sbx1").unwrap(); + assert!(current >= MAX_CONNECTIONS_PER_SANDBOX); + } + + // ---- Session reaper tests ---- + + #[tokio::test] + async fn reaper_deletes_expired_sessions() { + let store = Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(); + + let expired = make_session("expired1", "sbx1", now_ms() - 60_000, false); + store.put_message(&expired).await.unwrap(); + + let valid = make_session("valid1", "sbx1", now_ms() + 3_600_000, false); + store.put_message(&valid).await.unwrap(); + + reap_expired_sessions(&store).await.unwrap(); + + assert!( + store + .get_message::("expired1") + .await + .unwrap() + .is_none(), + "expired session should be reaped" + ); + assert!( + store + .get_message::("valid1") + .await + .unwrap() + .is_some(), + "valid session should be kept" + ); + } + + 
#[tokio::test] + async fn reaper_deletes_revoked_sessions() { + let store = Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(); + + let revoked = make_session("revoked1", "sbx1", 0, true); + store.put_message(&revoked).await.unwrap(); + + let active = make_session("active1", "sbx1", 0, false); + store.put_message(&active).await.unwrap(); + + reap_expired_sessions(&store).await.unwrap(); + + assert!( + store + .get_message::("revoked1") + .await + .unwrap() + .is_none(), + "revoked session should be reaped" + ); + assert!( + store + .get_message::("active1") + .await + .unwrap() + .is_some(), + "active session should be kept" + ); + } + + #[tokio::test] + async fn reaper_preserves_zero_expiry_sessions() { + let store = Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(); + + // expires_at_ms = 0 means no expiry (backward compatible). + let no_expiry = make_session("noexpiry1", "sbx1", 0, false); + store.put_message(&no_expiry).await.unwrap(); + + reap_expired_sessions(&store).await.unwrap(); + + assert!( + store + .get_message::("noexpiry1") + .await + .unwrap() + .is_some(), + "session with no expiry should be preserved" + ); + } + + // ---- Expiry validation logic tests ---- + + #[test] + fn expired_session_is_detected() { + let session = make_session("tok1", "sbx1", now_ms() - 1000, false); + let is_expired = session.expires_at_ms > 0 && now_ms() > session.expires_at_ms; + assert!(is_expired, "session in the past should be expired"); + } + + #[test] + fn future_session_is_not_expired() { + let session = make_session("tok1", "sbx1", now_ms() + 3_600_000, false); + let is_expired = session.expires_at_ms > 0 && now_ms() > session.expires_at_ms; + assert!(!is_expired, "session in the future should not be expired"); + } + + #[test] + fn zero_expiry_is_not_expired() { + let session = make_session("tok1", "sbx1", 0, false); + let is_expired = session.expires_at_ms > 0 && now_ms() > session.expires_at_ms; + assert!( + !is_expired, + 
"session with zero expiry should never be expired" + ); + } +} diff --git a/proto/navigator.proto b/proto/navigator.proto index b6513fb92..1232746f1 100644 --- a/proto/navigator.proto +++ b/proto/navigator.proto @@ -171,6 +171,9 @@ message CreateSshSessionResponse { // Optional host key fingerprint. string host_key_fingerprint = 7; + + // Expiry timestamp in milliseconds since epoch. 0 means no expiry. + int64 expires_at_ms = 8; } // Revoke SSH session request. @@ -249,6 +252,10 @@ message SshSession { // Human-friendly name (auto-generated if not provided). string name = 6; + + // Expiry timestamp in milliseconds since epoch. 0 means no expiry + // (backward-compatible default for sessions created before this field existed). + int64 expires_at_ms = 7; } // Watch sandbox request.