diff --git a/docs/hardening-wireframe-a-guide-to-production-resilience.md b/docs/hardening-wireframe-a-guide-to-production-resilience.md
index 9b9acc46..7f41c0f4 100644
--- a/docs/hardening-wireframe-a-guide-to-production-resilience.md
+++ b/docs/hardening-wireframe-a-guide-to-production-resilience.md
@@ -165,6 +165,30 @@ Initialization) pattern for safety.
 Connection tasks are wrapped with `catch_unwind` to log and discard panics.
 Each panicking connection is isolated so it cannot terminate the entire server.
 
+Each occurrence also increments the `wireframe_connection_panics_total`
+counter, enabling alerts on unexpected spikes. The counter intentionally omits
+peer address labels to limit cardinality and protect personally identifiable
+information. Operators can chart `rate(wireframe_connection_panics_total[5m])`
+in Prometheus and create Grafana panels to visualize instability. To emit this
+metric, enable the `metrics` Cargo feature and install a recorder such as
+`metrics_exporter_prometheus`, which exposes an HTTP endpoint for scraping.
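+
+A minimal wiring sketch, assuming the application hosts the exporter itself
+(the `9090` port is an arbitrary example):
+
+```rust
+use metrics_exporter_prometheus::PrometheusBuilder;
+
+fn main() {
+    // Install the Prometheus recorder globally and serve a scrape endpoint
+    // on 0.0.0.0:9090 for Prometheus to poll.
+    PrometheusBuilder::new()
+        .with_http_listener(([0, 0, 0, 0], 9090))
+        .install()
+        .expect("failed to install Prometheus recorder");
+}
+```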
+
 ### 3.2 Leak-Proof Registries with `Weak`/`Arc`
 
 A global `SessionRegistry` that stores `PushHandle`s to active connections is a
diff --git a/src/metrics.rs b/src/metrics.rs
index e756f8d1..6c688d92 100644
--- a/src/metrics.rs
+++ b/src/metrics.rs
@@ -24,6 +24,14 @@ pub const CONNECTIONS_ACTIVE: &str = "wireframe_connections_active";
 pub const FRAMES_PROCESSED: &str = "wireframe_frames_processed_total";
 /// Name of the counter tracking error occurrences.
 pub const ERRORS_TOTAL: &str = "wireframe_errors_total";
+/// Name of the counter tracking connection panics.
+///
+/// ```text
+/// # HELP wireframe_connection_panics_total Count of panicking connection tasks.
+/// # TYPE wireframe_connection_panics_total counter
+/// wireframe_connection_panics_total 1
+/// ```
+pub const CONNECTION_PANICS: &str = "wireframe_connection_panics_total";
 
 /// Direction of frame processing.
 #[derive(Clone, Copy)]
@@ -79,3 +87,25 @@ pub fn inc_handler_errors() { counter!(ERRORS_TOTAL, "kind" => "handler").increm
 
 #[cfg(not(feature = "metrics"))]
 pub fn inc_handler_errors() {}
+
+/// Record a panicking connection task.
+///
+/// # Examples
+///
+/// ```no_run
+/// use std::panic::catch_unwind;
+///
+/// use wireframe::metrics;
+///
+/// let res = catch_unwind(|| {
+///     panic!("boom");
+/// });
+/// if res.is_err() {
+///     metrics::inc_connection_panics();
+/// }
+/// ```
+#[cfg(feature = "metrics")]
+pub fn inc_connection_panics() { counter!(CONNECTION_PANICS).increment(1); }
+
+#[cfg(not(feature = "metrics"))]
+pub fn inc_connection_panics() {}
diff --git a/src/server/connection.rs b/src/server/connection.rs
index 95def829..503640d8 100644
--- a/src/server/connection.rs
+++ b/src/server/connection.rs
@@ -38,6 +38,7 @@ pub(super) fn spawn_connection_task(
         .catch_unwind();
 
     if let Err(panic) = fut.await {
+        crate::metrics::inc_connection_panics();
         let panic_msg = panic
             .downcast_ref::<&str>()
             .copied()
@@ -85,6 +86,7 @@ async fn process_stream(
 
 #[cfg(test)]
 mod tests {
+    use metrics_util::debugging::{DebugValue, DebuggingRecorder};
     use rstest::rstest;
     use tokio::{
         net::{TcpListener, TcpStream},
@@ -210,4 +212,50 @@ mod tests {
             .ok_or_else(|| "panic log not found".to_string())
         });
     }
+
+    /// Panics increment the connection panic counter.
+    #[rstest]
+    #[tokio::test]
+    async fn connection_panic_metric_increments(
+        factory: impl Fn() -> WireframeApp + Send + Sync + Clone + 'static,
+    ) {
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+        recorder.install().expect("recorder install");
+
+        let app_factory = move || {
+            factory()
+                .on_connection_setup(|| async { panic!("boom") })
+                .unwrap()
+        };
+        let tracker = TaskTracker::new();
+        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let addr = listener.local_addr().unwrap();
+
+        let task = tokio::spawn({
+            let tracker = tracker.clone();
+            let app_factory = app_factory;
+            async move {
+                let (stream, _) = listener.accept().await.unwrap();
+                spawn_connection_task::<_, ()>(stream, app_factory, None, None, &tracker);
+                tracker.close();
+                tracker.wait().await;
+            }
+        });
+
+        let client = TcpStream::connect(addr).await.unwrap();
+        client.writable().await.unwrap();
+        client.try_write(&[0; 8]).unwrap();
+        drop(client);
+
+        task.await.unwrap();
+        tokio::task::yield_now().await;
+
+        let metrics = snapshotter.snapshot().into_vec();
+        let found = metrics.iter().any(|(k, _, _, v)| {
+            k.key().name() == crate::metrics::CONNECTION_PANICS
+                && matches!(v, DebugValue::Counter(c) if *c > 0)
+        });
+        assert!(found, "connection panic metric not recorded");
+    }
 }
diff --git a/tests/metrics.rs b/tests/metrics.rs
index c246cff2..684f0bce 100644
--- a/tests/metrics.rs
+++ b/tests/metrics.rs
@@ -61,3 +61,23 @@ fn error_metric_increments() {
     });
     assert!(found, "error metric not recorded");
 }
+
+#[test]
+fn connection_panic_metric_increments() {
+    let (snapshotter, recorder) = debugging_recorder_setup();
+    metrics::with_local_recorder(&recorder, || {
+        wireframe::metrics::inc_connection_panics();
+    });
+
+    let metrics = snapshotter.snapshot().into_vec();
+    let count = metrics
+        .iter()
+        .find_map(|(k, _, _, v)| {
+            (k.key().name() == wireframe::metrics::CONNECTION_PANICS).then_some(match v {
+                DebugValue::Counter(c) => *c,
+                _ => 0,
+            })
+        })
+        .unwrap_or(0);
+    assert_eq!(1, count, "panic metric not recorded");
+}