Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/hardening-wireframe-a-guide-to-production-resilience.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,14 @@ Initialization) pattern for safety.
Connection tasks are wrapped with `catch_unwind` to log and discard panics.
Each panicking connection is isolated so it cannot terminate the entire server.

Each occurrence also increments the `wireframe_connection_panics_total`
Comment thread
leynos marked this conversation as resolved.
counter, enabling alerts on unexpected spikes. The counter intentionally omits
Comment thread
leynos marked this conversation as resolved.
peer address labels to limit cardinality and protect personally identifiable
Comment thread
leynos marked this conversation as resolved.
information. Operators can chart `rate(wireframe_connection_panics_total[5m])`
Comment thread
leynos marked this conversation as resolved.
in Prometheus and create Grafana panels to visualize instability. To emit this
Comment thread
leynos marked this conversation as resolved.
metric, enable the `metrics` Cargo feature and install a recorder such as
Comment thread
leynos marked this conversation as resolved.
`metrics_exporter_prometheus`, which exposes an HTTP endpoint for scraping.
Comment thread
leynos marked this conversation as resolved.

Comment thread
leynos marked this conversation as resolved.
### 3.2 Leak-Proof Registries with `Weak`/`Arc`

A global `SessionRegistry` that stores `PushHandle`s to active connections is a
Expand Down
30 changes: 30 additions & 0 deletions src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ pub const CONNECTIONS_ACTIVE: &str = "wireframe_connections_active";
pub const FRAMES_PROCESSED: &str = "wireframe_frames_processed_total";
/// Name of the counter tracking error occurrences.
pub const ERRORS_TOTAL: &str = "wireframe_errors_total";
/// Name of the counter tracking connection panics.
///
/// ```text
/// # HELP wireframe_connection_panics_total Count of panicking connection tasks.
/// # TYPE wireframe_connection_panics_total counter
/// wireframe_connection_panics_total 1
/// ```
pub const CONNECTION_PANICS: &str = "wireframe_connection_panics_total";

/// Direction of frame processing.
#[derive(Clone, Copy)]
Expand Down Expand Up @@ -79,3 +87,25 @@ pub fn inc_handler_errors() { counter!(ERRORS_TOTAL, "kind" => "handler").increm

#[cfg(not(feature = "metrics"))]
pub fn inc_handler_errors() {}

/// Record a panicking connection task.
///
/// # Examples
///
/// ```no_run
/// use std::panic::catch_unwind;
///
/// use wireframe::metrics;
///
/// let res = catch_unwind(|| {
/// panic!("boom");
/// });
/// if res.is_err() {
/// metrics::inc_connection_panics();
/// }
/// ```
#[cfg(feature = "metrics")]
pub fn inc_connection_panics() { counter!(CONNECTION_PANICS).increment(1); }

#[cfg(not(feature = "metrics"))]
pub fn inc_connection_panics() {}
48 changes: 48 additions & 0 deletions src/server/connection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ pub(super) fn spawn_connection_task<F, T>(
.catch_unwind();

if let Err(panic) = fut.await {
crate::metrics::inc_connection_panics();
let panic_msg = panic
.downcast_ref::<&str>()
.copied()
Expand Down Expand Up @@ -85,6 +86,7 @@ async fn process_stream<F, T>(

#[cfg(test)]
mod tests {
use metrics_util::debugging::{DebugValue, DebuggingRecorder};
use rstest::rstest;
use tokio::{
net::{TcpListener, TcpStream},
Expand Down Expand Up @@ -210,4 +212,50 @@ mod tests {
.ok_or_else(|| "panic log not found".to_string())
});
}

/// Panics increment the connection panic counter.
#[rstest]
#[tokio::test]
async fn connection_panic_metric_increments(
Comment thread
leynos marked this conversation as resolved.
factory: impl Fn() -> WireframeApp + Send + Sync + Clone + 'static,
) {
let recorder = DebuggingRecorder::new();
let snapshotter = recorder.snapshotter();
recorder.install().expect("recorder install");

let app_factory = move || {
factory()
.on_connection_setup(|| async { panic!("boom") })
.unwrap()
};
let tracker = TaskTracker::new();
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap();

let task = tokio::spawn({
let tracker = tracker.clone();
let app_factory = app_factory;
async move {
let (stream, _) = listener.accept().await.unwrap();
spawn_connection_task::<_, ()>(stream, app_factory, None, None, &tracker);
tracker.close();
tracker.wait().await;
}
});

let client = TcpStream::connect(addr).await.unwrap();
client.writable().await.unwrap();
client.try_write(&[0; 8]).unwrap();
drop(client);

task.await.unwrap();
tokio::task::yield_now().await;

let metrics = snapshotter.snapshot().into_vec();
let found = metrics.iter().any(|(k, _, _, v)| {
k.key().name() == crate::metrics::CONNECTION_PANICS
&& matches!(v, DebugValue::Counter(c) if *c > 0)
});
assert!(found, "connection panic metric not recorded");
}
}
20 changes: 20 additions & 0 deletions tests/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,23 @@ fn error_metric_increments() {
});
assert!(found, "error metric not recorded");
}

#[test]
fn connection_panic_metric_increments() {
let (snapshotter, recorder) = debugging_recorder_setup();
metrics::with_local_recorder(&recorder, || {
wireframe::metrics::inc_connection_panics();
});

let metrics = snapshotter.snapshot().into_vec();
let count = metrics
.iter()
.find_map(|(k, _, _, v)| {
(k.key().name() == wireframe::metrics::CONNECTION_PANICS).then_some(match v {
DebugValue::Counter(c) => *c,
_ => 0,
})
})
.unwrap_or(0);
assert_eq!(1, count, "panic metric not recorded");
}
Loading