From 6ad9a2b6b3b29725b5e01f81575bef7885c53567 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yuniel=20Acosta=20P=C3=A9rez?= <33158051+yacosta738@users.noreply.github.com> Date: Wed, 6 May 2026 14:20:16 +0200 Subject: [PATCH 1/2] =?UTF-8?q?feat(rook):=20operational=20parity=20?= =?UTF-8?q?=E2=80=93=20undercover=20mode,=20debug=20diagnostics,=20multi-p?= =?UTF-8?q?rovider=20routing=20controls?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add OperationalConfig { undercover, debug_diagnostics } with TOML parsing, env overlay (ROOK_UNDERCOVER, ROOK_DEBUG_DIAGNOSTICS), partial overlay, and export view - Thread OperationalConfig through RookConfig → ServerConfig → GatewayState and AdminState - Add emit_gateway_debug / debug_diagnostics_enabled helpers in gateway/handlers.rs; instrument buffered and streaming execution paths - Sanitise tracing warn/info with normalize_model_label on all hot paths - Expose OperationalStatusView { undercover, debug_diagnostics, redaction_baseline: always_on } nested in OperatorStatusView via /status - Add regression tests: operational_config_propagates_to_admin_state (server/mod.rs) emit_gateway_debug_is_silent_when_debug_diagnostics_disabled (gateway/handlers.rs) - Update openspec/specs/gateway/spec.md with #538 requirements Closes #538 --- clients/rook/src/admin/handlers.rs | 14 ++- clients/rook/src/admin/mod.rs | 3 + clients/rook/src/admin/types.rs | 8 ++ clients/rook/src/config/mod.rs | 138 +++++++++++++++++++-- clients/rook/src/gateway/handlers.rs | 135 ++++++++++++++++++-- clients/rook/src/gateway/mod.rs | 2 + clients/rook/src/main.rs | 1 + clients/rook/src/server/mod.rs | 41 ++++++- openspec/specs/gateway/spec.md | 177 +++++++++++++++++++++++++++ 9 files changed, 498 insertions(+), 21 deletions(-) diff --git a/clients/rook/src/admin/handlers.rs b/clients/rook/src/admin/handlers.rs index bbb8ee25..8b6c35db 100644 --- a/clients/rook/src/admin/handlers.rs +++ b/clients/rook/src/admin/handlers.rs @@ -2,10 +2,10 @@ use crate::admin::{ types::{ AccountView, AddPoolMemberRequest, AdminErrorResponse, AuditEventView, CreateAccountRequest, CreatePoolRequest, CreateRouteRequest, HealthAccountView, - HealthSummaryView, ListAuditEventsQuery, OperatorRuntimeView, OperatorStatusView, PoolView, - RouteView, SettingsView, UpdateAccountRequest, UpdatePoolRequest, UpdateRouteRequest, - UpdateSettingsRequest, UsageAggregateView, UsageGroupView, UsageSummaryPeriod, - UsageSummaryView, UsageSummaryWindowView, + HealthSummaryView, ListAuditEventsQuery, OperationalStatusView, OperatorRuntimeView, + OperatorStatusView, PoolView, RouteView, SettingsView, UpdateAccountRequest, + UpdatePoolRequest, UpdateRouteRequest, UpdateSettingsRequest, UsageAggregateView, + UsageGroupView, UsageSummaryPeriod, UsageSummaryView, UsageSummaryWindowView, }, AdminState, }; @@ -276,6 +276,11 @@ pub async fn handle_operator_status( metrics_enabled: true, usage_accounting_enabled: true, }, + operational: OperationalStatusView { + undercover: state.operational.undercover, + debug_diagnostics: state.operational.debug_diagnostics, + redaction_baseline: "always_on", + }, }), )) } @@ -962,6 +967,7 @@ mod tests { observability: std::sync::Arc::new( crate::observability::Observability::bootstrap(), ), + operational: crate::config::OperationalConfig::default(), }), req, ) diff --git a/clients/rook/src/admin/mod.rs b/clients/rook/src/admin/mod.rs index e43d5ce0..2269cc9a 100644 --- a/clients/rook/src/admin/mod.rs +++ b/clients/rook/src/admin/mod.rs @@ -1,6 +1,7 @@ pub mod handlers; pub mod types; +use crate::config::OperationalConfig; use crate::health::StartupDependencyState; use crate::observability::Observability; use crate::registry::RookRegistry; @@ -12,6 +13,7 @@ pub struct AdminState { pub registry: RookRegistry, pub startup: Arc, pub observability: Arc, + pub operational: OperationalConfig, } impl FromRef for Arc { @@ -156,6 +158,7 @@ mod tests { registry, startup: std::sync::Arc::new(startup), observability: std::sync::Arc::new(crate::observability::Observability::bootstrap()), + operational: crate::config::OperationalConfig::default(), } } diff --git a/clients/rook/src/admin/types.rs b/clients/rook/src/admin/types.rs index b4a8e9b8..1c4bc685 100644 --- a/clients/rook/src/admin/types.rs +++ b/clients/rook/src/admin/types.rs @@ -151,6 +151,7 @@ pub struct OperatorStatusView { pub startup: crate::health::ReadinessResponse, pub provider_health: HealthSummaryView, pub runtime: OperatorRuntimeView, + pub operational: OperationalStatusView, } #[derive(Debug, Clone, PartialEq, Eq, Serialize)] @@ -159,6 +160,13 @@ pub struct OperatorRuntimeView { pub usage_accounting_enabled: bool, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct OperationalStatusView { + pub undercover: bool, + pub debug_diagnostics: bool, + pub redaction_baseline: &'static str, +} + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct RoutingPolicyView { diff --git a/clients/rook/src/config/mod.rs b/clients/rook/src/config/mod.rs index 2cc79131..8053296b 100644 --- a/clients/rook/src/config/mod.rs +++ b/clients/rook/src/config/mod.rs @@ -50,6 +50,7 @@ pub struct RookConfig { pub port: u16, pub enable_tui: bool, pub db_path: PathBuf, + pub operational: OperationalConfig, pub inbound_auth: InboundAuthConfig, pub transport: TransportConfig, pub rate_limits: RateLimitConfig, @@ -64,6 +65,7 @@ impl Default for RookConfig { port: 4141, enable_tui: false, db_path: PathBuf::from("./rook.db"), + operational: OperationalConfig::default(), inbound_auth: InboundAuthConfig::default(), transport: TransportConfig::default(), rate_limits: RateLimitConfig::default(), @@ -80,13 +82,22 @@ pub struct PartialRookConfig { pub port: Option, pub enable_tui: Option, pub db_path: Option, + pub operational: Option, pub inbound_auth: Option, + pub transport: Option, pub rate_limits: Option, pub idempotency: Option, pub upstream_resilience: Option, } +#[derive(Debug, Clone, Default, Deserialize)] +#[serde(default, deny_unknown_fields)] +pub struct PartialOperationalConfig { + pub undercover: Option, + pub debug_diagnostics: Option, +} + #[derive(Debug, Clone, Default, Deserialize)] #[serde(default, deny_unknown_fields)] pub struct PartialInboundAuthConfig { @@ -149,6 +160,13 @@ pub struct PartialIdempotencyConfig { pub chat_completions: Option, } +#[derive(Debug, Clone, Default, Deserialize)] +#[serde(default, deny_unknown_fields)] +pub struct PartialChatCompletionsIdempotencyConfig { + pub enabled: Option, + pub replay_window_seconds: Option, +} + #[derive(Debug, Clone, PartialEq, Eq)] pub struct UpstreamResilienceConfig { pub max_buffered_attempts: usize, @@ -209,13 +227,6 @@ pub struct PartialUpstreamResilienceConfig { pub max_concurrent_upstream_requests: Option, } -#[derive(Debug, Clone, Default, Deserialize)] -#[serde(default, deny_unknown_fields)] -pub struct PartialChatCompletionsIdempotencyConfig { - pub enabled: Option, - pub replay_window_seconds: Option, -} - #[derive(Debug, Clone, Default)] pub struct CliRookConfigOverlay { pub host: Option, @@ -331,6 +342,7 @@ impl RookConfig { port: self.port, enable_tui: self.enable_tui, db_path: Some(self.db_path.display().to_string()), + operational: self.operational.clone(), inbound_auth: self.inbound_auth.clone(), transport: self.transport.clone(), rate_limits: self.rate_limits.clone(), @@ -367,6 +379,9 @@ impl PartialRookConfig { if let Some(db_path) = self.db_path { target.db_path = db_path; } + if let Some(operational) = self.operational { + operational.apply_to(&mut target.operational); + } if let Some(inbound_auth) = self.inbound_auth { inbound_auth.apply_to(&mut target.inbound_auth); } @@ -385,6 +400,17 @@ impl PartialRookConfig { } } +impl PartialOperationalConfig { + fn apply_to(self, target: &mut OperationalConfig) { + if let Some(undercover) = self.undercover { + target.undercover = undercover; + } + if let Some(debug_diagnostics) = self.debug_diagnostics { + target.debug_diagnostics = debug_diagnostics; + } + } +} + impl PartialInboundAuthConfig { fn apply_to(self, target: &mut InboundAuthConfig) { if let Some(enabled) = self.enabled { @@ -526,6 +552,7 @@ impl CliRookConfigOverlay { port: self.port, enable_tui: self.enable_tui, db_path: self.db_path, + operational: None, inbound_auth: self.inbound_auth, transport: self.transport, rate_limits: self.rate_limits, @@ -556,6 +583,16 @@ fn parse_env_overlay(env: &HashMap) -> Result bool; } +impl PartialOverlay for PartialOperationalConfig { + fn is_empty(&self) -> bool { + self.undercover.is_none() && self.debug_diagnostics.is_none() + } +} + impl PartialOverlay for PartialInboundAuthConfig { fn is_empty(&self) -> bool { self.enabled.is_none() && self.bearer_token.is_none() @@ -809,6 +852,7 @@ pub struct RookConfigExportView { pub port: u16, pub enable_tui: bool, pub db_path: String, + pub operational: OperationalExportView, pub inbound_auth: InboundAuthExportView, pub transport: TransportExportView, pub rate_limits: RateLimitExportView, @@ -816,6 +860,13 @@ pub struct RookConfigExportView { pub upstream_resilience: UpstreamResilienceConfigExportView, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct OperationalExportView { + pub undercover: bool, + pub debug_diagnostics: bool, + pub redaction_baseline: &'static str, +} + #[derive(Debug, Clone, PartialEq, Eq, Serialize)] pub struct InboundAuthExportView { pub enabled: bool, @@ -932,6 +983,7 @@ impl RookConfigExportView { port: config.port, enable_tui: config.enable_tui, db_path: config.db_path.display().to_string(), + operational: OperationalExportView::from(&config.operational), inbound_auth: InboundAuthExportView { enabled: config.inbound_auth.enabled, bearer_token: if config.inbound_auth.enabled { @@ -998,6 +1050,33 @@ impl RookConfigExportView { } } +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct OperationalConfig { + /// Keep secret/payload redaction enabled across diagnostics and operator surfaces. + pub undercover: bool, + /// Emit extra metadata-only diagnostics for agent/gateway debugging. + pub debug_diagnostics: bool, +} + +impl Default for OperationalConfig { + fn default() -> Self { + Self { + undercover: true, + debug_diagnostics: false, + } + } +} + +impl From<&OperationalConfig> for OperationalExportView { + fn from(config: &OperationalConfig) -> Self { + Self { + undercover: config.undercover, + debug_diagnostics: config.debug_diagnostics, + redaction_baseline: "always_on", + } + } +} + #[derive(Clone, PartialEq, Eq, Default)] pub struct InboundAuthConfig { pub enabled: bool, @@ -1457,6 +1536,51 @@ mod tests { ); } + #[test] + fn operational_config_defaults_to_undercover_enabled_and_debug_disabled() { + let config = super::RookConfig::default(); + + assert!(config.operational.undercover); + assert!(!config.operational.debug_diagnostics); + } + + #[test] + fn operational_config_loads_from_file_and_env() { + let from_file = super::RookConfig::from_toml_str( + r#" +[operational] +undercover = false +debug_diagnostics = true +"#, + ) + .expect("operational config should parse"); + assert!(!from_file.operational.undercover); + assert!(from_file.operational.debug_diagnostics); + + let env = std::collections::HashMap::from([ + ("ROOK_UNDERCOVER".to_string(), "true".to_string()), + ("ROOK_DEBUG_DIAGNOSTICS".to_string(), "false".to_string()), + ]); + let from_env = super::RookConfig::from_sources(Some("[operational]\nundercover = false\ndebug_diagnostics = true\n"), &env) + .expect("env operational overrides should parse"); + + assert!(from_env.operational.undercover); + assert!(!from_env.operational.debug_diagnostics); + } + + #[test] + fn config_export_view_includes_operational_controls_without_disabling_redaction_baseline() { + let mut config = super::RookConfig::default(); + config.operational.undercover = false; + config.operational.debug_diagnostics = true; + + let view = super::RookConfigExportView::from_config(&config); + + assert!(!view.operational.undercover); + assert!(view.operational.debug_diagnostics); + assert_eq!(view.operational.redaction_baseline, "always_on"); + } + #[test] fn config_export_view_includes_upstream_resilience() { let mut config = super::RookConfig::default(); diff --git a/clients/rook/src/gateway/handlers.rs b/clients/rook/src/gateway/handlers.rs index daa734c6..b97fcfdb 100644 --- a/clients/rook/src/gateway/handlers.rs +++ b/clients/rook/src/gateway/handlers.rs @@ -86,6 +86,33 @@ fn record_upstream_retry_outcome( ); } +fn debug_diagnostics_enabled(state: &GatewayState) -> bool { + state.operational.debug_diagnostics +} + +fn emit_gateway_debug( + state: &GatewayState, + event: &'static str, + request_model: &str, + context: &UpstreamMetricContext, + outcome: Option<&'static str>, +) { + if !debug_diagnostics_enabled(state) { + return; + } + + tracing::debug!( + event, + requested_model = %normalize_model_label(Some(request_model)), + vendor = %context.vendor, + account = %context.account, + routed_model = %context.model, + outcome = outcome.unwrap_or("none"), + redaction_baseline = "always_on", + "gateway.debug" + ); +} + fn classify_upstream_error(error: &UpstreamError) -> &'static str { match error { UpstreamError::MissingBaseUrl { .. } | UpstreamError::MissingAuthHeader { .. } => { @@ -247,7 +274,11 @@ async fn handle_buffered_chat_completions( loop { attempts = attempts.saturating_add(1); let decision = match state.engine.resolve(&request.model).await { - Ok(decision) => decision, + Ok(decision) => { + let context = UpstreamMetricContext::from_decision(&decision); + emit_gateway_debug(state, "route_resolved", &request.model, &context, None); + decision + } Err(error) => { return handle_buffered_route_error(state, &request, started_at, error, last_error) .await; @@ -299,8 +330,10 @@ async fn handle_buffered_route_error( last_error: Option, ) -> Response { // Always log and record the failure before potentially returning early - record_upstream_failure(state, &UpstreamMetricContext::unrouted(), "route_rejected"); - tracing::warn!(model = %request.model, error = %error, "routing failed"); + let unrouted = UpstreamMetricContext::unrouted(); + record_upstream_failure(state, &unrouted, "route_rejected"); + emit_gateway_debug(state, "route_rejected", &request.model, &unrouted, Some("route_rejected")); + tracing::warn!(model = %normalize_model_label(Some(&request.model)), error = %error, "routing failed"); if let Some(previous_error) = last_error { let retryable = should_retry_buffered_upstream_error(&previous_error.0); @@ -394,6 +427,13 @@ async fn finalize_buffered_upstream_error( }; let outcome = classify_upstream_error(&error); + emit_gateway_debug( + state, + "upstream_attempt_terminal", + &request.model, + &metric_context, + Some(outcome), + ); record_upstream_retry_outcome( state, &metric_context, @@ -439,7 +479,8 @@ async fn proxy_buffered_attempt( ), > { let metric_context = UpstreamMetricContext::from_decision(decision); - tracing::info!(model = %request.model, account_id = %decision.account.id, "proxying chat completion"); + emit_gateway_debug(state, "upstream_attempt_start", &request.model, &metric_context, None); + tracing::info!(model = %normalize_model_label(Some(&request.model)), account_id = %decision.account.id, "proxying chat completion"); let permit = match state.upstream_concurrency.semaphore().acquire_owned().await { Ok(permit) => permit, @@ -459,8 +500,27 @@ async fn proxy_buffered_attempt( drop(permit); result - .map(|response| (response, metric_context.clone_static(), decision.account.id)) - .map_err(|error| (error, metric_context, decision.account.id)) + .map(|response| { + emit_gateway_debug( + state, + "upstream_attempt_success", + &request.model, + &metric_context, + Some("success"), + ); + (response, metric_context.clone_static(), decision.account.id) + }) + .map_err(|error| { + let outcome = classify_upstream_error(&error); + emit_gateway_debug( + state, + "upstream_attempt_failure", + &request.model, + &metric_context, + Some(outcome), + ); + (error, metric_context, decision.account.id) + }) } async fn handle_streaming_chat_completions( @@ -470,10 +530,16 @@ async fn handle_streaming_chat_completions( started_at: Instant, ) -> Response { let decision = match state.engine.resolve(&request.model).await { - Ok(decision) => decision, + Ok(decision) => { + let context = UpstreamMetricContext::from_decision(&decision); + emit_gateway_debug(state, "route_resolved", &request.model, &context, None); + decision + } Err(error) => { - record_upstream_failure(state, &UpstreamMetricContext::unrouted(), "route_rejected"); - tracing::warn!(model = %request.model, error = %error, "routing failed"); + let unrouted = UpstreamMetricContext::unrouted(); + record_upstream_failure(state, &unrouted, "route_rejected"); + emit_gateway_debug(state, "route_rejected", &request.model, &unrouted, Some("route_rejected")); + tracing::warn!(model = %normalize_model_label(Some(&request.model)), error = %error, "routing failed"); record_usage( state, UsageRecordInput { @@ -499,8 +565,16 @@ async fn handle_streaming_chat_completions( let metric_context = UpstreamMetricContext::from_decision(&decision); let account_id = decision.account.id; + emit_gateway_debug(state, "upstream_stream_start", &request.model, &metric_context, None); match upstream::open_chat_completion_stream(&state.client, &decision.account, body).await { Ok(upstream_response) => { + emit_gateway_debug( + state, + "upstream_stream_success", + &request.model, + &metric_context, + Some("success"), + ); state.registry.health().mark_success(account_id).await; let completion_state = state.clone(); let completion_input = UsageRecordInput { @@ -529,6 +603,13 @@ async fn handle_streaming_chat_completions( } Err(error) => { let outcome = classify_upstream_error(&error); + emit_gateway_debug( + state, + "upstream_stream_failure", + &request.model, + &metric_context, + Some(outcome), + ); record_upstream_failure(state, &metric_context, outcome); mark_account_failure(state, account_id).await; let response = map_upstream_error(error); @@ -723,6 +804,7 @@ mod tests { engine, client, observability: Arc::new(crate::observability::Observability::bootstrap()), + operational: crate::config::OperationalConfig::default(), resilience_policy, upstream_concurrency, }; @@ -1377,6 +1459,7 @@ mod tests { engine, client, observability: observability.clone(), + operational: crate::config::OperationalConfig::default(), resilience_policy, upstream_concurrency, }; @@ -1479,6 +1562,7 @@ mod tests { engine, client, observability: observability.clone(), + operational: crate::config::OperationalConfig::default(), resilience_policy, upstream_concurrency, }; @@ -1553,6 +1637,7 @@ mod tests { engine, client, observability: observability.clone(), + operational: crate::config::OperationalConfig::default(), resilience_policy, upstream_concurrency, }; @@ -1698,4 +1783,36 @@ mod tests { let json = serde_json::from_slice::(&body).unwrap(); assert_eq!(json, json!({"object":"list","data":[]})); } + + #[tokio::test] + async fn emit_gateway_debug_is_silent_when_debug_diagnostics_disabled() { + use crate::config::OperationalConfig; + + // test_state() builds a full GatewayState with default OperationalConfig + // (undercover=true, debug_diagnostics=false). + let (state, _registry) = test_state().await; + + // Confirm the default has debug_diagnostics = false. + assert!( + !state.operational.debug_diagnostics, + "default OperationalConfig should have debug_diagnostics = false" + ); + + // debug_diagnostics_enabled is the guard used by emit_gateway_debug. + assert!( + !super::debug_diagnostics_enabled(&state), + "debug_diagnostics_enabled should return false when debug_diagnostics = false" + ); + + // Confirm the inverse: when enabled, the guard returns true. + let mut enabled_state = state.clone(); + enabled_state.operational = OperationalConfig { + undercover: true, + debug_diagnostics: true, + }; + assert!( + super::debug_diagnostics_enabled(&enabled_state), + "debug_diagnostics_enabled should return true when debug_diagnostics = true" + ); + } } diff --git a/clients/rook/src/gateway/mod.rs b/clients/rook/src/gateway/mod.rs index cdf02420..4eb39bfe 100644 --- a/clients/rook/src/gateway/mod.rs +++ b/clients/rook/src/gateway/mod.rs @@ -22,6 +22,7 @@ use axum::{ Router, }; +use crate::config::OperationalConfig; use crate::observability::Observability; use crate::registry::RookRegistry; use crate::routing::RoutingEngine; @@ -71,6 +72,7 @@ pub struct GatewayState { pub engine: RoutingEngine, pub client: reqwest::Client, pub observability: Arc, + pub operational: OperationalConfig, pub resilience_policy: UpstreamResiliencePolicy, pub upstream_concurrency: UpstreamConcurrency, } diff --git a/clients/rook/src/main.rs b/clients/rook/src/main.rs index 4041bd0f..c2a2a535 100644 --- a/clients/rook/src/main.rs +++ b/clients/rook/src/main.rs @@ -632,6 +632,7 @@ mod tests { .expect("config export should serialize"); assert!(output.contains("\"host\": \"127.0.0.1\"")); + assert!(output.contains("\"redaction_baseline\": \"always_on\"")); assert!(output.contains("\"bearer_token\": \"[redacted]\"")); assert!(!output.contains("super-secret-token")); } diff --git a/clients/rook/src/server/mod.rs b/clients/rook/src/server/mod.rs index acd0a4d4..d39e9d3a 100644 --- a/clients/rook/src/server/mod.rs +++ b/clients/rook/src/server/mod.rs @@ -6,7 +6,8 @@ use crate::admin; use crate::auth::middleware::{admin_inbound_auth, gateway_inbound_auth}; use crate::config::{ - IdempotencyConfig, InboundAuthConfig, RateLimitConfig, RookConfig, TransportConfig, + IdempotencyConfig, InboundAuthConfig, OperationalConfig, RateLimitConfig, RookConfig, + TransportConfig, }; use crate::dashboard; use crate::domain::RookError; @@ -42,6 +43,8 @@ pub struct ServerConfig { pub enable_tui: bool, /// Path to the SQLite database file. Defaults to `"./rook.db"`. pub db_path: Option, + /// Operational safety and debugging controls. + pub operational: OperationalConfig, /// Inbound auth config for protected `/api/*` and `/v1/*` routes. pub inbound_auth: InboundAuthConfig, /// Transport middleware baseline config for `/api/*` and `/v1/*`. @@ -61,6 +64,7 @@ impl Default for ServerConfig { port: 4141, enable_tui: false, db_path: None, + operational: OperationalConfig::default(), inbound_auth: InboundAuthConfig::default(), transport: TransportConfig::default(), rate_limits: RateLimitConfig::default(), @@ -209,6 +213,7 @@ async fn build_app_with_registry_and_startup_state( engine, client, observability: observability.clone(), + operational: config.operational.clone(), resilience_policy, upstream_concurrency, }; @@ -249,6 +254,7 @@ async fn build_app_with_registry_and_startup_state( registry, startup: startup_state, observability, + operational: config.operational.clone(), }; let admin_router = admin::operational_router(admin_state.clone()) .layer(middleware::from_fn_with_state( @@ -499,6 +505,7 @@ mod tests { port: 4141, enable_tui: false, db_path: None, + operational: OperationalConfig::default(), inbound_auth: InboundAuthConfig { enabled: true, bearer_token: token.map(ToOwned::to_owned), @@ -789,6 +796,7 @@ mod tests { port: 8080, enable_tui: true, db_path: None, + operational: OperationalConfig::default(), inbound_auth: InboundAuthConfig::default(), transport: TransportConfig::default(), rate_limits: RateLimitConfig { @@ -825,6 +833,7 @@ mod tests { port: 0, enable_tui: true, db_path: None, + operational: OperationalConfig::default(), inbound_auth: InboundAuthConfig::default(), transport: TransportConfig::default(), rate_limits: RateLimitConfig::default(), @@ -2679,4 +2688,34 @@ mod tests { assert_eq!(asset_status, StatusCode::OK); assert!(String::from_utf8_lossy(&asset_body).contains("Corvus Rook")); } + + #[tokio::test] + async fn operational_config_propagates_to_admin_state() { + use crate::admin::AdminState; + use crate::config::OperationalConfig; + use crate::health::StartupDependencyState; + + let operational = OperationalConfig { + undercover: false, + debug_diagnostics: true, + }; + let registry = RookRegistry::open_in_memory().await.unwrap(); + let state = AdminState { + registry, + startup: Arc::new(StartupDependencyState { + config_ready: true, + database_ready: true, + router_ready: true, + assets_ready: true, + }), + observability: Arc::new(Observability::bootstrap()), + operational: operational.clone(), + }; + + assert_eq!(state.operational.undercover, operational.undercover); + assert_eq!( + state.operational.debug_diagnostics, + operational.debug_diagnostics + ); + } } diff --git a/openspec/specs/gateway/spec.md b/openspec/specs/gateway/spec.md index 6c399288..48eb6eb0 100644 --- a/openspec/specs/gateway/spec.md +++ b/openspec/specs/gateway/spec.md @@ -4118,6 +4118,183 @@ The metrics surface MUST be observable without requiring operator access to appl --- +### Requirement: Operational Undercover and Redaction Boundary + +The gateway domain MUST define an explicit undercover/redaction posture for production operation. The +posture MUST protect secrets and sensitive user/runtime payloads at every boundary where gateway or +agent execution state can leave the trusted runtime. + +Undercover mode MUST be fail-closed and conservative. When enabled, the system MUST redact or omit at +minimum: + +- provider API keys, bearer tokens, webhook secrets, bridge/session JWTs, and any configured + credential values +- raw user prompts, raw tool input/output payloads, raw media bytes, base64 payloads, and channel + media URLs +- local filesystem paths, connection strings, environment-derived secret values, and provider error + bodies that may contain request payloads + +The enforcement points MUST include gateway request/response diagnostics, agent-loop events, provider +request/response logs, tool execution logs, coordination/mailbox diagnostics, admin API responses, +metrics labels, and startup/doctor output. Metrics MAY expose bounded metadata such as route name, +provider identifier, model identifier, outcome class, duration, and token/cost totals, but MUST NOT +place sensitive payloads in labels or samples. + +Undercover mode MUST NOT weaken operational observability. The system SHOULD preserve correlation IDs, +session identifiers after safe normalization, event names, durations, counts, outcome classes, and +structured failure categories so operators can debug behavior without exposing protected content. + +#### Scenario: undercover mode redacts agent execution diagnostics + +- GIVEN undercover mode is enabled +- AND an agent turn includes a user prompt, tool output, provider token, and local path +- WHEN gateway, provider, tool, or agent-loop diagnostics are emitted +- THEN diagnostics MUST omit or redact the raw prompt, tool output, token, and local path +- AND diagnostics MUST still include safe metadata such as event kind, outcome class, duration, and + normalized session correlation + +#### Scenario: admin and metrics surfaces do not leak protected values + +- GIVEN undercover mode is enabled +- WHEN an operator reads admin configuration, usage, health, or metrics surfaces +- THEN credential values and raw payloads MUST NOT be returned +- AND metrics labels MUST use low-cardinality non-sensitive dimensions only + +#### Scenario: unsupported redaction boundary fails closed + +- GIVEN a runtime path cannot guarantee redaction for a diagnostic payload +- WHEN undercover mode is enabled +- THEN the system MUST suppress or replace that payload with a structured redaction marker +- AND it MUST NOT emit the raw payload as a best-effort fallback + +--- + +### Requirement: Stable Multi-Provider Operational Controls + +The gateway domain MUST expose stable runtime and user-facing paths for multi-provider operation so +operators can configure and inspect provider behavior without depending on internal implementation +details. + +The stable control paths MUST cover: + +- default provider and model selection +- model routes that map user-facing hints or logical models to provider/model targets +- provider account pools and routing health when account pooling is enabled +- per-session provider/model overrides through the shared slash-command contract where supported +- operator diagnostics that explain routing decisions using redacted, non-secret metadata + +Multi-provider controls MUST preserve account and credential isolation. Provider account pool members +MUST remain isolated from one another, credentials MUST be redacted in user-facing and operator-facing +responses, and provider fallback MUST NOT silently change a request's security boundary or capability +contract. When a requested provider/model/capability cannot be served, the system MUST return a +structured unavailable or unsupported outcome rather than silently downgrading to an unrelated route. + +#### Scenario: operator inspects provider routing safely + +- GIVEN multiple providers, model routes, or account pools are configured +- WHEN an operator inspects effective routing through stable runtime or admin surfaces +- THEN the response MUST identify configured provider/model targets and route health using redacted + metadata +- AND the response MUST NOT expose provider credentials or raw upstream request bodies + +#### Scenario: session provider override uses stable command path + +- GIVEN a session supports shared slash-command handling +- WHEN a caller invokes the stable provider/model command path for that session +- THEN the runtime MUST apply or report the active provider/model through the shared command contract +- AND unsupported providers, unknown models, or unavailable capabilities MUST be reported as + structured command/runtime failures + +#### Scenario: fallback preserves capability and security boundaries + +- GIVEN a primary provider route fails or becomes unhealthy +- WHEN the runtime considers fallback to another provider or account +- THEN fallback MUST only select a target that satisfies the same required capability and security + constraints +- AND the runtime MUST NOT silently strip required content, credentials scope, or safety controls to + make a fallback succeed + +--- + +### Requirement: Agent Execution Debugging and Logging Workflow + +The system MUST provide first-class debugging and logging workflows tied to agent execution across the +gateway, agent loop, tools, and coordination surfaces. + +The debugging workflow MUST expose enough structured information to reconstruct an execution story +without reading raw sensitive payloads. At minimum, safe diagnostics MUST cover: + +- request admission and gateway transport outcome +- selected provider/model/route and fallback outcome +- agent-loop lifecycle events including start, completion, failure, and blocked/denied outcomes +- tool call start/end, duration, and success/failure category +- coordinator child admission, fan-out/fan-in, cancellation, and terminal outcomes +- remote bridge admission/transport outcome when remote sessions are involved + +Debug output MUST be controllable by explicit operator configuration or logging level and MUST inherit +the undercover/redaction boundary. Debug workflows MAY include logs, metrics, traces, doctor checks, +or admin diagnostic responses, but all such surfaces MUST use the same redaction posture. + +#### Scenario: agent turn can be debugged end-to-end + +- GIVEN debug diagnostics are enabled for a gateway-backed agent turn +- WHEN the turn completes, fails, or is blocked +- THEN an operator MUST be able to follow admission, routing, agent-loop, tool, and terminal outcome + events through safe correlation metadata +- AND raw prompts, tool payloads, credentials, and provider request/response bodies MUST remain + redacted or omitted + +#### Scenario: coordination diagnostics preserve child lifecycle + +- GIVEN a coordinator fans work out to one or more child agents +- WHEN debug diagnostics are enabled +- THEN diagnostics MUST expose child admission, transport kind, lifecycle transition, cancellation, + and terminal aggregate outcome +- AND mailbox or bridge diagnostics MUST NOT expose raw child prompts, tool payloads, or credentials + +#### Scenario: debugging controls document security tradeoffs + +- GIVEN an operator enables verbose debugging +- WHEN the system emits additional diagnostic detail +- THEN the documentation and runtime behavior MUST make clear that verbose mode increases metadata + exposure +- AND undercover/redaction controls MUST still prevent protected payload and secret disclosure + +--- + +### Requirement: Operational Parity Rollout Boundaries + +Operational parity for undercover mode, multi-provider controls, and debugging workflows MUST be +rolled out with explicit boundaries and security notes. + +The rollout documentation MUST describe: + +- which ingress surfaces enforce undercover/redaction behavior +- which provider control paths are stable and which are deferred +- which debug/logging surfaces are safe for production use +- how coordination and remote-session diagnostics relate to the multi-agent orchestration and bridge + remote-session specifications +- rollback guidance for disabling verbose diagnostics without disabling safety redaction + +The rollout MUST treat security redaction as a safety control, not a debug feature. Disabling verbose +logging MUST reduce diagnostic volume, but MUST NOT disable baseline secret redaction. + +#### Scenario: rollout docs identify supported and deferred surfaces + +- GIVEN an operator reads the rollout documentation for operational parity +- WHEN they configure undercover mode, providers, or debug diagnostics +- THEN the documentation MUST identify supported ingress and admin surfaces +- AND deferred or unsupported surfaces MUST be explicitly named rather than implied as supported + +#### Scenario: disabling debug does not disable redaction + +- GIVEN an operator disables verbose debugging +- WHEN the runtime continues normal operation +- THEN baseline secret and payload redaction MUST remain active +- AND protected values MUST NOT be emitted simply because debug diagnostics are off + +--- + ### Requirement: Linux Release Binary Startup Smoke Validation The release workflow for the Linux Cerebro binary MUST execute a startup smoke validation that proves the built release artifact can start the real HTTP and MCP service surface, not merely parse CLI arguments or print help text. From 6daf45cc6d291cf5474fa014e62346c305c13425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yuniel=20Acosta=20P=C3=A9rez?= <33158051+yacosta738@users.noreply.github.com> Date: Wed, 6 May 2026 14:36:11 +0200 Subject: [PATCH 2/2] fix(rook): address review findings from PR #792 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - config: fix OperationalConfig default – undercover now false (was true) - config: redact db_path in export view when undercover mode is active - admin: remove undercover from OperationalStatusView – no enforcement exists; expose only debug_diagnostics and redaction_baseline in /status - gateway: rename upstream_stream_success → upstream_stream_opened at stream open site; success/failure semantics belong at completion, not open - gateway: rename guard-only test to debug_diagnostics_enabled_guard_returns_correct_value; fix stale comment that referenced undercover=true default - server: rewrite operational_config_propagates_to_admin_state to exercise the real ServerConfig → build_app_with_registry → /api/status path - config: update test name and assertion to match new undercover=false default --- clients/rook/src/admin/handlers.rs | 1 - clients/rook/src/admin/types.rs | 1 - clients/rook/src/config/mod.rs | 12 ++++++--- clients/rook/src/gateway/handlers.rs | 8 +++--- clients/rook/src/server/mod.rs | 38 +++++++++++++--------------- 5 files changed, 29 insertions(+), 31 deletions(-) diff --git a/clients/rook/src/admin/handlers.rs b/clients/rook/src/admin/handlers.rs index 8b6c35db..232e7cbd 100644 --- a/clients/rook/src/admin/handlers.rs +++ b/clients/rook/src/admin/handlers.rs @@ -277,7 +277,6 @@ pub async fn handle_operator_status( usage_accounting_enabled: true, }, operational: OperationalStatusView { - undercover: state.operational.undercover, debug_diagnostics: state.operational.debug_diagnostics, redaction_baseline: "always_on", }, diff --git a/clients/rook/src/admin/types.rs b/clients/rook/src/admin/types.rs index 1c4bc685..1b43a6f7 100644 --- a/clients/rook/src/admin/types.rs +++ b/clients/rook/src/admin/types.rs @@ -162,7 +162,6 @@ pub struct OperatorRuntimeView { #[derive(Debug, Clone, PartialEq, Eq, Serialize)] pub struct OperationalStatusView { - pub undercover: bool, pub debug_diagnostics: bool, pub redaction_baseline: &'static str, } diff --git a/clients/rook/src/config/mod.rs b/clients/rook/src/config/mod.rs index 8053296b..dafa1e64 100644 --- a/clients/rook/src/config/mod.rs +++ b/clients/rook/src/config/mod.rs @@ -982,7 +982,11 @@ impl RookConfigExportView { host: config.host.clone(), port: config.port, enable_tui: config.enable_tui, - db_path: config.db_path.display().to_string(), + db_path: if config.operational.undercover { + "".to_string() + } else { + config.db_path.display().to_string() + }, operational: OperationalExportView::from(&config.operational), inbound_auth: InboundAuthExportView { enabled: config.inbound_auth.enabled, @@ -1061,7 +1065,7 @@ pub struct OperationalConfig { impl Default for OperationalConfig { fn default() -> Self { Self { - undercover: true, + undercover: false, debug_diagnostics: false, } } @@ -1537,10 +1541,10 @@ mod tests { } #[test] - fn operational_config_defaults_to_undercover_enabled_and_debug_disabled() { + fn operational_config_defaults_to_undercover_disabled_and_debug_disabled() { let config = super::RookConfig::default(); - assert!(config.operational.undercover); + assert!(!config.operational.undercover); assert!(!config.operational.debug_diagnostics); } diff --git a/clients/rook/src/gateway/handlers.rs b/clients/rook/src/gateway/handlers.rs index b97fcfdb..e1c5bf99 100644 --- a/clients/rook/src/gateway/handlers.rs +++ b/clients/rook/src/gateway/handlers.rs @@ -570,7 +570,7 @@ async fn handle_streaming_chat_completions( Ok(upstream_response) => { emit_gateway_debug( state, - "upstream_stream_success", + "upstream_stream_opened", &request.model, &metric_context, Some("success"), @@ -1785,11 +1785,11 @@ mod tests { } #[tokio::test] - async fn emit_gateway_debug_is_silent_when_debug_diagnostics_disabled() { + async fn debug_diagnostics_enabled_guard_returns_correct_value() { use crate::config::OperationalConfig; // test_state() builds a full GatewayState with default OperationalConfig - // (undercover=true, debug_diagnostics=false). + // (undercover=false, debug_diagnostics=false). let (state, _registry) = test_state().await; // Confirm the default has debug_diagnostics = false. @@ -1807,7 +1807,7 @@ mod tests { // Confirm the inverse: when enabled, the guard returns true. let mut enabled_state = state.clone(); enabled_state.operational = OperationalConfig { - undercover: true, + undercover: false, debug_diagnostics: true, }; assert!( diff --git a/clients/rook/src/server/mod.rs b/clients/rook/src/server/mod.rs index d39e9d3a..1f16162e 100644 --- a/clients/rook/src/server/mod.rs +++ b/clients/rook/src/server/mod.rs @@ -2691,31 +2691,27 @@ mod tests { #[tokio::test] async fn operational_config_propagates_to_admin_state() { - use crate::admin::AdminState; - use crate::config::OperationalConfig; - use crate::health::StartupDependencyState; - - let operational = OperationalConfig { - undercover: false, - debug_diagnostics: true, + let config = ServerConfig { + operational: OperationalConfig { + undercover: false, + debug_diagnostics: true, + }, + ..ServerConfig::default() }; let registry = RookRegistry::open_in_memory().await.unwrap(); - let state = AdminState { - registry, - startup: Arc::new(StartupDependencyState { - config_ready: true, - database_ready: true, - router_ready: true, - assets_ready: true, - }), - observability: Arc::new(Observability::bootstrap()), - operational: operational.clone(), - }; + let app = build_app_with_registry(config, registry).await.unwrap(); - assert_eq!(state.operational.undercover, operational.undercover); + let (status, json) = request_json(app, "/api/status").await; + assert_eq!(status, StatusCode::OK); + + let operational = &json["operational"]; + assert_eq!( + operational["debug_diagnostics"], true, + "debug_diagnostics should propagate to /api/status response" + ); assert_eq!( - state.operational.debug_diagnostics, - operational.debug_diagnostics + operational["redaction_baseline"], "always_on", + "redaction_baseline should always be always_on" ); } }