From 43a0530f0b2a6095807bcd8cffe4233df564a76d Mon Sep 17 00:00:00 2001 From: obchain Date: Wed, 22 Apr 2026 17:09:05 +0530 Subject: [PATCH 01/11] feat(grafana): dashboard JSON importable into Grafana / Grafana Cloud MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New `deploy/grafana/charon.json` — one-click-importable dashboard covering every metric the `charon-metrics` exporter actually emits. Panels: - Scanner: blocks / sec per chain - Pipeline: block-latency p50 and p95 (from the histogram) - Scanner: live position counts stacked by bucket (green/yellow/red for healthy/near_liq/liquidatable) - Executor: queue depth stat + cumulative profit stat (USD) - Executor: simulations / min stacked by result (ok/revert/error) - Executor: opportunities queued vs dropped (per drop stage) - Executor: per-opportunity profit distribution as a heatmap - Build info: version + git_sha in a table Templating: - `$datasource` — Prometheus data source picker - `$chain` — auto-populated from `charon_scanner_blocks_total` labels - `$instance` — auto-populated from `charon_build_info` labels Aspirational panels from #49 body that are NOT included yet because the exporter doesn't emit the underlying series: - Mempool txs / min, impacted positions flagged - Gas (base fee, priority fee, tx cost in cents) - RPC latency p50/p95, error rate per endpoint These will be added as follow-up PRs alongside the metrics that feed them; shipping the panels blank would only clutter the dashboard. Dashboard UID `charon-v0` and stable tags mean re-importing replaces rather than duplicates the dashboard in Grafana. README gains a three-step import section pointing at the new file. Closes #49. 
--- README.md | 10 + deploy/grafana/charon.json | 470 +++++++++++++++++++++++++++++++++++++ 2 files changed, 480 insertions(+) create mode 100644 deploy/grafana/charon.json diff --git a/README.md b/README.md index 8984a0c..cc4b6eb 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,16 @@ Key series (full list: [`crates/charon-metrics/src/lib.rs`](crates/charon-metric The exporter binds `:9091` (not `:9090`) so it doesn't collide with a co-located Prometheus server. +### Grafana dashboard + +A ready-to-import dashboard lives at [`deploy/grafana/charon.json`](deploy/grafana/charon.json). Three steps to load it into Grafana or Grafana Cloud: + +1. Add a Prometheus data source that scrapes `http://:9091/metrics` (every ~10 s is fine). +2. In Grafana, **Dashboards → New → Import → Upload JSON file** and pick the file above. +3. On the import screen, select the Prometheus data source you created and click **Import**. + +Dashboard UID is `charon-v0` and tags are `charon`, `liquidation`, `defi` — re-importing over an existing copy replaces it rather than duplicating. Variables (`Chain`, `Instance`) auto-populate from label values once metrics start flowing. + ## Repository layout ``` diff --git a/deploy/grafana/charon.json b/deploy/grafana/charon.json new file mode 100644 index 0000000..6d6e803 --- /dev/null +++ b/deploy/grafana/charon.json @@ -0,0 +1,470 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Charon liquidation bot — scanner, executor, and profit telemetry. 
Scrapes the charon-metrics Prometheus exporter (default :9091).", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Rate of blocks drained from the chain listener, per chain.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "blocks / sec", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" } + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "id": 1, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (chain) (rate(charon_scanner_blocks_total{instance=~\"$instance\",chain=~\"$chain\"}[1m]))", + "legendFormat": "{{chain}}", + "range": true, + "refId": "A" + } + ], + "title": "Scanner — blocks per second", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Per-block pipeline wall-clock latency. 
p50 and p95 from the histogram.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "seconds", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "id": 2, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le, chain) (rate(charon_pipeline_block_duration_seconds_bucket{instance=~\"$instance\",chain=~\"$chain\"}[5m])))", + "legendFormat": "p50 {{chain}}", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le, chain) (rate(charon_pipeline_block_duration_seconds_bucket{instance=~\"$instance\",chain=~\"$chain\"}[5m])))", + "legendFormat": "p95 {{chain}}", + "range": true, + "refId": "B" + } + ], + "title": "Pipeline — block latency p50/p95", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Currently tracked positions bucketed by health classification.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "positions", + "drawStyle": "line", + "fillOpacity": 60, + "lineWidth": 1, + "showPoints": "never", + "stacking": { "group": "A", "mode": "normal" } + }, + "unit": "short" + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "healthy" }, "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } }] }, + { "matcher": { "id": "byName", "options": "near_liq" }, "properties": [{ "id": "color", "value": { "mode": "fixed", 
"fixedColor": "yellow" } }] }, + { "matcher": { "id": "byName", "options": "liquidatable" }, "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "id": 3, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (bucket) (charon_scanner_positions{instance=~\"$instance\",chain=~\"$chain\"})", + "legendFormat": "{{bucket}}", + "range": true, + "refId": "A" + } + ], + "title": "Scanner — positions by bucket", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Current depth of the profit-ordered opportunity queue.", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 10 }, + { "color": "red", "value": 50 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 8 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "charon_executor_queue_depth{instance=~\"$instance\"}", + "legendFormat": "queue depth", + "range": true, + "refId": "A" + } + ], + "title": "Queue depth", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Cumulative profit across all opportunities that cleared the profit gate (USD).", + "fieldConfig": 
{ + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 8 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(charon_executor_profit_usd_cents_sum{instance=~\"$instance\",chain=~\"$chain\"}) / 100", + "legendFormat": "total profit", + "range": true, + "refId": "A" + } + ], + "title": "Cumulative profit", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Simulation outcomes per minute, partitioned by result.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "sim / min", + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never", + "stacking": { "group": "A", "mode": "normal" } + }, + "unit": "short" + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "ok" }, "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } }] }, + { "matcher": { "id": "byName", "options": "revert" }, "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "orange" } }] }, + { "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "id": 6, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + 
"datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (result) (rate(charon_executor_simulations_total{instance=~\"$instance\",chain=~\"$chain\"}[1m])) * 60", + "legendFormat": "{{result}}", + "range": true, + "refId": "A" + } + ], + "title": "Executor — simulations per minute", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Opportunity outcomes: queued (passed every gate) vs dropped (and at which stage).", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "opps / min", + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never", + "stacking": { "group": "A", "mode": "normal" } + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "id": 7, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(charon_executor_opportunities_queued_total{instance=~\"$instance\",chain=~\"$chain\"}[1m])) * 60", + "legendFormat": "queued", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (stage) (rate(charon_executor_opportunities_dropped_total{instance=~\"$instance\",chain=~\"$chain\"}[1m])) * 60", + "legendFormat": "dropped — {{stage}}", + "range": true, + "refId": "B" + } + ], + "title": "Executor — opportunities queued vs dropped", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Distribution of per-opportunity net profit (USD cents). 
Heat-map uses the underlying histogram buckets.", + "fieldConfig": { + "defaults": { + "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "scaleDistribution": { "type": "linear" } } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "id": 8, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { "color": "rgba(255,0,255,0.7)" }, + "filterValues": { "le": 1e-9 }, + "legend": { "show": true }, + "rowsFrame": { "layout": "auto" }, + "tooltip": { "mode": "single", "showColorScale": false, "yHistogram": false }, + "yAxis": { "axisPlacement": "left", "reverse": false, "unit": "short" } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (le) (rate(charon_executor_profit_usd_cents_bucket{instance=~\"$instance\",chain=~\"$chain\"}[5m]))", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Executor — per-opportunity profit distribution (cents)", + "type": "heatmap" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Running build metadata. 
`version` and `git_sha` come from the Charon process at startup.", + "fieldConfig": { + "defaults": { "custom": { "align": "auto" } }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 9, + "options": { + "cellHeight": "sm", + "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "charon_build_info{instance=~\"$instance\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Build info", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "Value": true, "__name__": true, "job": true }, + "indexByName": {}, + "renameByName": {} + } + } + ], + "type": "table" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": ["charon", "liquidation", "defi"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "prometheus" }, + "hide": 0, + "includeAll": false, + "label": "Data source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(charon_scanner_blocks_total, chain)", + "hide": 0, + "includeAll": true, + "label": "Chain", + "multi": true, + "name": "chain", + "options": [], + "query": { "query": "label_values(charon_scanner_blocks_total, chain)", "refId": "StandardVariableQuery" }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, + "definition": "label_values(charon_build_info, instance)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "options": [], + "query": { "query": "label_values(charon_build_info, instance)", "refId": "StandardVariableQuery" }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-15m", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Charon Bot", + "uid": "charon-v0", + "version": 1, + "weekStart": "" +} From 02bbd10486d27f33081cc0095fd41759bffe149a Mon Sep 17 00:00:00 2001 From: obchain Date: Thu, 23 Apr 2026 15:58:15 +0530 Subject: [PATCH 02/11] fix(grafana): document BSC 3s block-time context on latency panel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The p50/p95 block-duration quantiles already use a [5m] range, giving ~100 observations at BSC's 3s cadence — enough to keep the estimate stable between scrapes. Extend the panel description so the rationale is visible to operators reading the dashboard. Closes #279 --- deploy/grafana/charon.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/grafana/charon.json b/deploy/grafana/charon.json index 6d6e803..662de1d 100644 --- a/deploy/grafana/charon.json +++ b/deploy/grafana/charon.json @@ -64,7 +64,7 @@ }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "description": "Per-block pipeline wall-clock latency. p50 and p95 from the histogram.", + "description": "Per-block pipeline wall-clock latency. p50 and p95 from the histogram. 
BSC produces a block every ~3s; quantiles use a [5m] range (~100 observations) so they stay stable across scrapes.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, From 257600f50bb9948e70399dc2ba27c45b2a87d775 Mon Sep 17 00:00:00 2001 From: obchain Date: Thu, 23 Apr 2026 15:58:52 +0530 Subject: [PATCH 03/11] fix(grafana): use increase() over dashboard range for profit panel The cumulative-profit panel previously queried the histogram _sum accumulator directly and divided by 100. _sum resets on process restart, so a mid-window redeploy rendered as a sharp step-down indistinguishable from a real loss. Switch to increase(..._sum[$__range]) / 100 so the counter-reset semantics of increase() absorb restarts and the window tracks the dashboard time picker. Title and description updated to match. Closes #276 --- deploy/grafana/charon.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deploy/grafana/charon.json b/deploy/grafana/charon.json index 662de1d..58b1e3b 100644 --- a/deploy/grafana/charon.json +++ b/deploy/grafana/charon.json @@ -193,7 +193,7 @@ }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "description": "Cumulative profit across all opportunities that cleared the profit gate (USD).", + "description": "Profit across all opportunities that cleared the profit gate (USD), summed over the selected dashboard time range. 
Uses increase() over the histogram _sum accumulator so process restarts do not appear as negative steps; the window follows the top-right time picker.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -223,13 +223,13 @@ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(charon_executor_profit_usd_cents_sum{instance=~\"$instance\",chain=~\"$chain\"}) / 100", - "legendFormat": "total profit", + "expr": "sum(increase(charon_executor_profit_usd_cents_sum{instance=~\"$instance\",chain=~\"$chain\"}[$__range])) / 100", + "legendFormat": "profit (selected range)", "range": true, "refId": "A" } ], - "title": "Cumulative profit", + "title": "Profit (selected range)", "type": "stat" }, { From 17761658cfcd8023ced8ee4f2c3b9ab11e0ec740 Mon Sep 17 00:00:00 2001 From: obchain Date: Thu, 23 Apr 2026 15:59:40 +0530 Subject: [PATCH 04/11] fix(grafana): align funnel series on stage label via label_replace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The opportunity funnel panel aggregated queued as a single total while dropped was broken out by stage. The two series rendered on incomparable axes: one line vs four, no way to read stage-level loss rate against intake. Wrap the queued query in label_replace(..., "stage", "queued") so it joins the dropped series under one stage-partitioned legend. Colour overrides stay inert — Grafana picks from the classic palette for the five stage labels. 
Closes #280 --- deploy/grafana/charon.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deploy/grafana/charon.json b/deploy/grafana/charon.json index 58b1e3b..94872f9 100644 --- a/deploy/grafana/charon.json +++ b/deploy/grafana/charon.json @@ -275,7 +275,7 @@ }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "description": "Opportunity outcomes: queued (passed every gate) vs dropped (and at which stage).", + "description": "Opportunity funnel, per minute, grouped by stage. `queued` counts entries that cleared the profit gate and landed in the queue (the #220 `simulated` label is rolled up here); `router`, `profit`, `simulation`, and `build` are drop stages. Both series share the same `stage` axis so intake and losses are directly comparable.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -301,8 +301,8 @@ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(rate(charon_executor_opportunities_queued_total{instance=~\"$instance\",chain=~\"$chain\"}[1m])) * 60", - "legendFormat": "queued", + "expr": "label_replace(sum(rate(charon_executor_opportunities_queued_total{instance=~\"$instance\",chain=~\"$chain\"}[1m])) * 60, \"stage\", \"queued\", \"\", \"\")", + "legendFormat": "{{stage}}", "range": true, "refId": "A" }, @@ -310,7 +310,7 @@ "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum by (stage) (rate(charon_executor_opportunities_dropped_total{instance=~\"$instance\",chain=~\"$chain\"}[1m])) * 60", - "legendFormat": "dropped — {{stage}}", + "legendFormat": "{{stage}}", "range": true, "refId": "B" } From a1e785f41e667f55b4469aa841c0f755e073cae1 Mon Sep 17 00:00:00 2001 From: obchain Date: Thu, 23 Apr 2026 16:01:28 +0530 Subject: [PATCH 05/11] fix(grafana): default chain and instance variables to All (.*) Template variables $chain and $instance resolve via label_values, which returns empty until 
the bot is scraping. Without an explicit default, fresh imports rendered every panel as No Data. Set allValue='.*' on both, mark the current selection as All, and add a short description on each variable plus the dashboard so operators see that the All-default exists by design. Panels now resolve on first import and auto-refine once labels populate. Closes #282 --- deploy/grafana/charon.json | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/deploy/grafana/charon.json b/deploy/grafana/charon.json index 94872f9..d21b691 100644 --- a/deploy/grafana/charon.json +++ b/deploy/grafana/charon.json @@ -12,7 +12,7 @@ } ] }, - "description": "Charon liquidation bot — scanner, executor, and profit telemetry. Scrapes the charon-metrics Prometheus exporter (default :9091).", + "description": "Charon liquidation bot — scanner, executor, and profit telemetry. Scrapes the charon-metrics Prometheus exporter (default :9091). Chain/Instance variables default to All (.*) so panels render before the first scrape populates label_values; they auto-refine once metrics flow.", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -425,9 +425,11 @@ "type": "datasource" }, { - "current": { "selected": false, "text": "All", "value": "$__all" }, + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, "datasource": { "type": "prometheus", "uid": "${datasource}" }, "definition": "label_values(charon_scanner_blocks_total, chain)", + "description": "Chain label. 
Defaults to All (.*) so panels render data even before label_values is populated (fresh import or cold start).", "hide": 0, "includeAll": true, "label": "Chain", @@ -442,9 +444,11 @@ "type": "query" }, { - "current": { "selected": false, "text": "All", "value": "$__all" }, + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, "datasource": { "type": "prometheus", "uid": "${datasource}" }, "definition": "label_values(charon_build_info, instance)", + "description": "Instance label (Prometheus scrape target). Defaults to All (.*) so panels render before the first scrape populates the dropdown.", "hide": 0, "includeAll": true, "label": "Instance", From ecca34d8df5cf41324b13be9b9df5698bb1e6e51 Mon Sep 17 00:00:00 2001 From: obchain Date: Thu, 23 Apr 2026 16:02:13 +0530 Subject: [PATCH 06/11] docs(readme): pin Grafana dashboard import to 10.4+ Dashboard JSON is schema v39, which requires Grafana 10.4.x or newer (or any Grafana Cloud org). Earlier 9.x installs reject the import or silently drop panels. Call out the version requirement in the Grafana section so self-hosted operators running a stale Grafana do not hit a cryptic import error. Closes #278 --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cc4b6eb..e86c279 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,9 @@ The exporter binds `:9091` (not `:9090`) so it doesn't collide with a co-located ### Grafana dashboard -A ready-to-import dashboard lives at [`deploy/grafana/charon.json`](deploy/grafana/charon.json). Three steps to load it into Grafana or Grafana Cloud: +A ready-to-import dashboard lives at [`deploy/grafana/charon.json`](deploy/grafana/charon.json). The dashboard JSON is schema v39, which needs **Grafana 10.4.x or newer** (any Grafana Cloud org qualifies); older 9.x installs will reject the import or silently drop panels. + +Three steps to load it into Grafana or Grafana Cloud: 1. 
Add a Prometheus data source that scrapes `http://:9091/metrics` (every ~10 s is fine). 2. In Grafana, **Dashboards → New → Import → Upload JSON file** and pick the file above. From dce8e36fa00bb1cc488dccfb46eb63102f5cdee0 Mon Sep 17 00:00:00 2001 From: obchain Date: Thu, 23 Apr 2026 16:03:45 +0530 Subject: [PATCH 07/11] docs(readme): security note on metrics scrape before Grafana import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The exporter currently binds 0.0.0.0:9091 without auth (tracked in #213 and #214). Directing operators to point a remote Prometheus at that URL bakes the exposure into the quickstart, leaking profit histograms, build SHA, queue depth, and sim results to anyone with network access — on a Hetzner VPS that is the public internet. Add a callout above the import steps: bind to 127.0.0.1 and tunnel, or put an authenticated reverse proxy in front of :9091, before configuring an external scrape. Step 1 echoes the same guidance so someone skim-reading the numbered list does not miss it. Closes #277 --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e86c279..b1a695c 100644 --- a/README.md +++ b/README.md @@ -84,9 +84,11 @@ The exporter binds `:9091` (not `:9090`) so it doesn't collide with a co-located A ready-to-import dashboard lives at [`deploy/grafana/charon.json`](deploy/grafana/charon.json). The dashboard JSON is schema v39, which needs **Grafana 10.4.x or newer** (any Grafana Cloud org qualifies); older 9.x installs will reject the import or silently drop panels. +> **Security — read before exposing `:9091`.** The metrics endpoint ships unauthenticated and binds `0.0.0.0` by default. On a public VPS (Hetzner CX22, the documented target) that exposes profit histograms, build SHA, queue depth, and simulation results to the internet. 
Before scraping from a remote Prometheus, either bind the exporter to `127.0.0.1` and scrape over a local socket / SSH tunnel / Tailscale, or put a reverse proxy with basic auth (or mTLS) in front of `:9091`. See tracking issues [#213](https://github.com/obchain/Charon/issues/213) and [#214](https://github.com/obchain/Charon/issues/214). + Three steps to load it into Grafana or Grafana Cloud: -1. Add a Prometheus data source that scrapes `http://:9091/metrics` (every ~10 s is fine). +1. Add a Prometheus data source backed by a Prometheus server that scrapes `http://<host>:9091/metrics` (every ~10 s is fine). Use a loopback address, a VPN endpoint, or an authenticated reverse-proxy URL as the scrape target — never a raw public-internet address. 2. In Grafana, **Dashboards → New → Import → Upload JSON file** and pick the file above. 3. On the import screen, select the Prometheus data source you created and click **Import**. From 83d74c5737009fc52d868b09852336072b2958f8 Mon Sep 17 00:00:00 2001 From: obchain Date: Thu, 23 Apr 2026 19:00:52 +0530 Subject: [PATCH 08/11] fix(grafana): hide git_sha in build-info until /metrics has auth exclude git_sha from the build-info table transform and document the deferred mempool/gas/rpc-latency panels in the dashboard description (tracked in #300, #301, #302). --- deploy/grafana/charon.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/grafana/charon.json b/deploy/grafana/charon.json index d21b691..0d0c05e 100644 --- a/deploy/grafana/charon.json +++ b/deploy/grafana/charon.json @@ -12,7 +12,7 @@ } ] }, - "description": "Charon liquidation bot — scanner, executor, and profit telemetry. Scrapes the charon-metrics Prometheus exporter (default :9091). Chain/Instance variables default to All (.*) so panels render before the first scrape populates label_values; they auto-refine once metrics flow.", + "description": "Charon liquidation bot — scanner, executor, and profit telemetry. Scrapes the charon-metrics Prometheus exporter (default :9091). 
Chain/Instance variables default to All (.*) so panels render before the first scrape populates label_values; they auto-refine once metrics flow. Mempool / gas / RPC-latency panels deferred pending backing series: mempool #300, gas #301, rpc-latency #302. Alerting rules live in deploy/grafana/alerts.yaml.", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -365,7 +365,7 @@ }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "description": "Running build metadata. `version` and `git_sha` come from the Charon process at startup.", + "description": "Running build metadata. Only `version` is surfaced; `git_sha` is intentionally hidden via transform exclusion until /metrics has auth (open #214) and LAN exposure is addressed (#213). Surfacing the exact SHA of a running binary to anyone with Grafana read access is an intelligence leak while those are unresolved.", "fieldConfig": { "defaults": { "custom": { "align": "auto" } }, "overrides": [] @@ -395,7 +395,7 @@ { "id": "organize", "options": { - "excludeByName": { "Time": true, "Value": true, "__name__": true, "job": true }, + "excludeByName": { "Time": true, "Value": true, "__name__": true, "job": true, "git_sha": true }, "indexByName": {}, "renameByName": {} } From f61f72e1d6bdc8b8e869ac205b259dfaffe06427 Mon Sep 17 00:00:00 2001 From: obchain Date: Thu, 23 Apr 2026 19:01:45 +0530 Subject: [PATCH 09/11] feat(grafana): alerting rules for scanner, queue, sim-fail, drop-rate five prometheus rules covering bot-down, 1h-zero-liquidations, queue depth spike, simulation failure rate, and opportunity drop rate. load via prometheus rule_files or grafana unified alerting. 
--- deploy/grafana/alerts.yaml | 90 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 deploy/grafana/alerts.yaml diff --git a/deploy/grafana/alerts.yaml b/deploy/grafana/alerts.yaml new file mode 100644 index 0000000..31b7c87 --- /dev/null +++ b/deploy/grafana/alerts.yaml @@ -0,0 +1,90 @@ +--- +# Prometheus alerting rules for the Charon liquidation bot. +# +# Load via Prometheus (rule_files:) or Grafana unified alerting +# (Alerting -> Contact points -> Rules -> Upload file). +# +# Thresholds are tuned for a single-host BSC deployment on a 3s block +# cadence. Adjust per-environment before wiring a pager. + +groups: + - name: charon-availability + interval: 30s + rules: + - alert: CharonScannerStalled + expr: rate(charon_scanner_blocks_total[1m]) == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Charon scanner has stopped consuming blocks" + description: | + No blocks drained from the listener in the last minute on + {{ $labels.instance }} / chain {{ $labels.chain }}. The bot + is effectively down — every liquidation opportunity in this + window is missed. Check the container status and the WS + upstream. + + - alert: CharonNoLiquidationAttemptsOneHour + expr: increase(charon_executor_opportunities_queued_total[1h]) == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Charon has queued zero opportunities for 1h" + description: | + increase(charon_executor_opportunities_queued_total[1h]) == 0 + on {{ $labels.instance }}. Either BSC is unusually healthy + (possible), the scanner is broken (possible), or the health + check thresholds are mis-tuned. Inspect + charon_scanner_positions_by_bucket to disambiguate. 
+
+  - name: charon-pipeline-health
+    interval: 30s
+    rules:
+      - alert: CharonQueueDepthHigh  # warning once queue depth has stayed > 50 for the full 2m `for:` window
+        expr: charon_executor_queue_depth > 50
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Charon executor queue depth sustained above 50"
+          description: |
+            Queue depth {{ $value }} on {{ $labels.instance }} for 2
+            minutes. Executor is not draining; possible private-RPC
+            stall, nonce-manager contention, or gas-ceiling skips.
+
+      - alert: CharonSimulationFailureRateHigh
+        expr: |
+          sum by (instance, chain) (rate(charon_executor_simulations_total{result="failure"}[5m]))
+          /
+          sum by (instance, chain) (rate(charon_executor_simulations_total[5m]))
+          > 0.5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Charon simulation failure rate > 50% for 5m"
+          description: |
+            Over half of simulations are reverting on
+            {{ $labels.instance }} / chain {{ $labels.chain }}. Most
+            likely: Aave pool paused/frozen, Venus oracle stale, swap
+            router mispriced, or contract ABI drift.
+
+      - alert: CharonOpportunityDropRateHigh  # dropped-rate / queued-rate per (instance, chain) over 5m; NOTE(review): 0/0 is NaN, so an idle window never fires — confirm intended
+        expr: |
+          sum by (instance, chain) (rate(charon_executor_opportunities_dropped_total[5m]))
+          /
+          sum by (instance, chain) (rate(charon_executor_opportunities_queued_total[5m]))
+          > 0.9
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Charon dropping > 90% of queued opportunities"
+          description: |
+            Nearly every queued opportunity is dropping on
+            {{ $labels.instance }} / chain {{ $labels.chain }}.
+            Most likely upstream pipeline issue — fresher HF from a
+            newer block, gas ceiling too tight, or min-profit set above
+            current market conditions.

From 7b3259a823f55dedd8d448b4b7937c6d4c55101b Mon Sep 17 00:00:00 2001
From: obchain
Date: Thu, 23 Apr 2026 19:02:52 +0530
Subject: [PATCH 10/11] ci(grafana): lint dashboard json, schema, promql, alert rules

adds grafana-lint workflow: json.tool parse, dashboard-linter schema
and promql check, promtool check rules for alerts.yaml.
replaces the pr-test-plan claim that json.tool alone validated the dashboard.
---
 .github/workflows/grafana-lint.yml | 40 ++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 .github/workflows/grafana-lint.yml

diff --git a/.github/workflows/grafana-lint.yml b/.github/workflows/grafana-lint.yml
new file mode 100644
index 0000000..53d5d4b
--- /dev/null
+++ b/.github/workflows/grafana-lint.yml
@@ -0,0 +1,40 @@
+name: grafana-lint
+
+on:
+  push:
+    paths:
+      - 'deploy/grafana/**'
+      - '.github/workflows/grafana-lint.yml'
+  pull_request:
+    paths:
+      - 'deploy/grafana/**'
+      - '.github/workflows/grafana-lint.yml'
+
+permissions:
+  contents: read
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Validate dashboard JSON syntax
+        run: python3 -m json.tool deploy/grafana/charon.json > /dev/null
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.22'
+
+      - name: Install grafana dashboard-linter
+        run: go install github.com/grafana/dashboard-linter@latest
+
+      - name: Lint dashboard schema and PromQL
+        run: dashboard-linter lint --strict deploy/grafana/charon.json
+
+      - name: Validate alert rules with promtool  # the image's ENTRYPOINT is /bin/prometheus, so it must be overridden to run promtool
+        run: |
+          docker run --rm --entrypoint /bin/promtool -v "$PWD/deploy/grafana:/rules:ro" \
+            prom/prometheus:v2.55.1 \
+            check rules /rules/alerts.yaml

From cd068575a731c5266ea08dfbcae2f5b59437ebbb Mon Sep 17 00:00:00 2001
From: obchain
Date: Fri, 24 Apr 2026 22:51:08 +0530
Subject: [PATCH 11/11] fix(grafana): align alerts.yaml metric names to charon-metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace non-existent charon_scanner_positions_by_bucket reference in
  CharonNoLiquidationAttemptsOneHour annotation with canonical
  charon_scanner_positions{bucket="near_liq"|"liquidatable"} series.
- Fix CharonSimulationFailureRateHigh numerator: the sim_result label
  values declared in charon-metrics are "ok" / "revert" / "error" —
  there is no "failure". Use result=~"revert|error" so the ratio is
  non-zero when simulations actually fail.

Names and labels now match crates/charon-metrics/src/lib.rs::names and
::sim_result on main. No Rust changes.
---
 deploy/grafana/alerts.yaml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/deploy/grafana/alerts.yaml b/deploy/grafana/alerts.yaml
index 31b7c87..2efe529 100644
--- a/deploy/grafana/alerts.yaml
+++ b/deploy/grafana/alerts.yaml
@@ -37,7 +37,9 @@ groups:
             on {{ $labels.instance }}. Either BSC is unusually healthy
             (possible), the scanner is broken (possible), or the health
             check thresholds are mis-tuned. Inspect
-            charon_scanner_positions_by_bucket to disambiguate.
+            charon_scanner_positions{bucket="near_liq"} and
+            charon_scanner_positions{bucket="liquidatable"} to
+            disambiguate.
 
   - name: charon-pipeline-health
     interval: 30s
@@ -56,7 +58,7 @@ groups:
 
       - alert: CharonSimulationFailureRateHigh
         expr: |
-          sum by (instance, chain) (rate(charon_executor_simulations_total{result="failure"}[5m]))
+          sum by (instance, chain) (rate(charon_executor_simulations_total{result=~"revert|error"}[5m]))
           /
           sum by (instance, chain) (rate(charon_executor_simulations_total[5m]))
           > 0.5