diff --git a/.github/workflows/grafana-lint.yml b/.github/workflows/grafana-lint.yml new file mode 100644 index 0000000..53d5d4b --- /dev/null +++ b/.github/workflows/grafana-lint.yml @@ -0,0 +1,41 @@ +name: grafana-lint + +on: + push: + paths: + - 'deploy/grafana/**' + - '.github/workflows/grafana-lint.yml' + pull_request: + paths: + - 'deploy/grafana/**' + - '.github/workflows/grafana-lint.yml' + +permissions: + contents: read + +jobs: + validate: + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + + - name: Validate dashboard JSON syntax + run: python3 -m json.tool deploy/grafana/charon.json > /dev/null + + - uses: actions/setup-go@v5 + with: + go-version: '1.22' + + - name: Install grafana dashboard-linter + run: go install github.com/grafana/dashboard-linter@latest + + - name: Lint dashboard schema and PromQL + run: dashboard-linter lint --strict deploy/grafana/charon.json + + # The prom/prometheus image's ENTRYPOINT is the Prometheus server binary, so promtool must be selected with --entrypoint. + - name: Validate alert rules with promtool + run: | + docker run --rm -v "$PWD/deploy/grafana:/rules:ro" \ + --entrypoint promtool prom/prometheus:v2.55.1 \ + check rules /rules/alerts.yaml diff --git a/README.md b/README.md index f2170cb..36444b8 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ Charon monitors under-collateralized positions across major DeFi lending protoco - [Safety model](#safety-model) - [Getting started](#getting-started) - [Configuration](#configuration) +- [Metrics](#metrics) - [Project structure](#project-structure) - [Roadmap](#roadmap) - [Contributing](#contributing) @@ -228,6 +229,41 @@ The fork profile omits `[liquidator.bnb]` by default — after `forge create` ag --- +## Metrics + +Every profile ships with a Prometheus exporter enabled. Scrape `http://<host>:9091/metrics`. The exporter binds `:9091` (not `:9090`) so it doesn't collide with a co-located Prometheus server. 
+ +Key series (single source of truth in [`crates/charon-metrics/src/lib.rs`](crates/charon-metrics/src/lib.rs) — the `names` module is what dashboards and alert rules must match): + +| Metric | Type | Labels | +| --- | --- | --- | +| `charon_scanner_blocks_total` | counter | chain | +| `charon_scanner_positions` | gauge | chain, bucket | +| `charon_pipeline_block_duration_seconds` | histogram | chain | +| `charon_executor_simulations_total` | counter | chain, result | +| `charon_executor_opportunities_queued_total` | counter | chain | +| `charon_executor_opportunities_dropped_total` | counter | chain, stage | +| `charon_executor_profit_usd_cents` | histogram | chain | +| `charon_executor_queue_depth` | gauge | — | + +### Grafana dashboard + +A ready-to-import dashboard lives at [`deploy/grafana/charon.json`](deploy/grafana/charon.json) and a matching alert-rule bundle at [`deploy/grafana/alerts.yaml`](deploy/grafana/alerts.yaml). The dashboard is built against **Grafana 10.4.x or newer** (the JSON declares dashboard `schemaVersion` 39; Grafana Cloud satisfies this too); older 9.x installs will reject the import or silently drop panels. + +> **Security — read before exposing `:9091`.** The metrics endpoint ships unauthenticated and binds `0.0.0.0` by default. On a public VPS (Hetzner CX22, the documented target) that exposes profit histograms, build SHA, queue depth, and simulation results to the internet. Before scraping from a remote Prometheus, either bind the exporter to `127.0.0.1` and scrape over a local socket / SSH tunnel / Tailscale, or put a reverse proxy with basic auth (or mTLS) in front of `:9091`. See tracking issues [#213](https://github.com/obchain/Charon/issues/213) and [#214](https://github.com/obchain/Charon/issues/214). + +Three steps to load the dashboard into Grafana or Grafana Cloud: + +1. Add a Prometheus data source to Grafana, backed by a Prometheus server that scrapes `http://<host>:9091/metrics` (every ~10 s is fine). 
Use a loopback address, a VPN endpoint, or an authenticated reverse-proxy URL here — never a raw public-internet address. +2. In Grafana, **Dashboards → New → Import → Upload JSON file** and pick the file above. +3. On the import screen, select the Prometheus data source you created and click **Import**. + +Dashboard UID is `charon-v0` and tags are `charon`, `liquidation`, `defi` — re-importing over an existing copy replaces it rather than duplicating. Variables (`Chain`, `Instance`) auto-populate from label values once metrics start flowing. + +Alert rules in `deploy/grafana/alerts.yaml` can be loaded by Prometheus via `rule_files:` or by Grafana unified alerting (**Alerting → Contact points → Rules → Upload file**). Thresholds are tuned for a single-host BSC deployment on a 3s block cadence — adjust per-environment before wiring a pager. + +--- + ## Project structure ``` diff --git a/deploy/grafana/alerts.yaml b/deploy/grafana/alerts.yaml new file mode 100644 index 0000000..2efe529 --- /dev/null +++ b/deploy/grafana/alerts.yaml @@ -0,0 +1,92 @@ +--- +# Prometheus alerting rules for the Charon liquidation bot. +# +# Load via Prometheus (rule_files:) or Grafana unified alerting +# (Alerting -> Contact points -> Rules -> Upload file). +# +# Thresholds are tuned for a single-host BSC deployment on a 3s block +# cadence. Adjust per-environment before wiring a pager. + +groups: + - name: charon-availability + interval: 30s + rules: + - alert: CharonScannerStalled + expr: rate(charon_scanner_blocks_total[1m]) == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Charon scanner has stopped consuming blocks" + description: | + No blocks drained from the listener in the last minute on + {{ $labels.instance }} / chain {{ $labels.chain }}. The bot + is effectively down — every liquidation opportunity in this + window is missed. Check the container status and the WS + upstream. 
+ + - alert: CharonNoLiquidationAttemptsOneHour + expr: increase(charon_executor_opportunities_queued_total[1h]) == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Charon has queued zero opportunities for 1h" + description: | + increase(charon_executor_opportunities_queued_total[1h]) == 0 + on {{ $labels.instance }} / chain {{ $labels.chain }}. Either BSC + is unusually healthy (possible), the scanner is broken + (possible), or the health check thresholds are mis-tuned. Inspect + charon_scanner_positions{bucket="near_liq"} and + charon_scanner_positions{bucket="liquidatable"} to + disambiguate. + + - name: charon-pipeline-health + interval: 30s + rules: + - alert: CharonQueueDepthHigh + expr: charon_executor_queue_depth > 50 + for: 2m + labels: + severity: warning + annotations: + summary: "Charon executor queue depth sustained above 50" + description: | + Queue depth {{ $value }} on {{ $labels.instance }} for 2 + minutes. Executor is not draining; possible private-RPC + stall, nonce-manager contention, or gas-ceiling skips. + + - alert: CharonSimulationFailureRateHigh + expr: | + sum by (instance, chain) (rate(charon_executor_simulations_total{result=~"revert|error"}[5m])) + / + sum by (instance, chain) (rate(charon_executor_simulations_total[5m])) + > 0.5 + for: 5m + labels: + severity: warning + annotations: + summary: "Charon simulation failure rate > 50% for 5m" + description: | + Over half of simulations are reverting or erroring on + {{ $labels.instance }} / chain {{ $labels.chain }}. Most + likely: Aave pool paused/frozen, Venus oracle stale, swap + router mispriced, or contract ABI drift. 
+ + - alert: CharonOpportunityDropRateHigh + expr: | + sum by (instance, chain) (rate(charon_executor_opportunities_dropped_total[5m])) + / + sum by (instance, chain) (rate(charon_executor_opportunities_queued_total[5m])) + > 0.9 + for: 5m + labels: + severity: warning + annotations: + summary: "Charon dropping > 90% of queued opportunities" + description: | + Nearly every queued opportunity is dropping on + {{ $labels.instance }} / chain {{ $labels.chain }}. + Most likely upstream pipeline issue — fresher HF from a + newer block, gas ceiling too tight, or min-profit set above + current market conditions. diff --git a/deploy/grafana/charon.json b/deploy/grafana/charon.json new file mode 100644 index 0000000..0d0c05e --- /dev/null +++ b/deploy/grafana/charon.json @@ -0,0 +1,474 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Charon liquidation bot — scanner, executor, and profit telemetry. Scrapes the charon-metrics Prometheus exporter (default :9091). Chain/Instance variables default to All (.*) so panels render before the first scrape populates label_values; they auto-refine once metrics flow. Mempool / gas / RPC-latency panels deferred pending backing series: mempool #300, gas #301, rpc-latency #302. 
Alerting rules live in deploy/grafana/alerts.yaml.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Rate of blocks drained from the chain listener, per chain.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "blocks / sec", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" } + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "id": 1, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (chain) (rate(charon_scanner_blocks_total{instance=~\"$instance\",chain=~\"$chain\"}[1m]))", + "legendFormat": "{{chain}}", + "range": true, + "refId": "A" + } + ], + "title": "Scanner — blocks per second", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Per-block pipeline wall-clock latency. p50 and p95 from the histogram. 
BSC produces a block every ~3s; quantiles use a [5m] range (~100 observations) so they stay stable across scrapes.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "seconds", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "id": 2, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le, chain) (rate(charon_pipeline_block_duration_seconds_bucket{instance=~\"$instance\",chain=~\"$chain\"}[5m])))", + "legendFormat": "p50 {{chain}}", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le, chain) (rate(charon_pipeline_block_duration_seconds_bucket{instance=~\"$instance\",chain=~\"$chain\"}[5m])))", + "legendFormat": "p95 {{chain}}", + "range": true, + "refId": "B" + } + ], + "title": "Pipeline — block latency p50/p95", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Currently tracked positions bucketed by health classification.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "positions", + "drawStyle": "line", + "fillOpacity": 60, + "lineWidth": 1, + "showPoints": "never", + "stacking": { "group": "A", "mode": "normal" } + }, + "unit": "short" + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "healthy" }, "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } }] }, + { "matcher": { "id": "byName", "options": 
"near_liq" }, "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "yellow" } }] }, + { "matcher": { "id": "byName", "options": "liquidatable" }, "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "id": 3, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (bucket) (charon_scanner_positions{instance=~\"$instance\",chain=~\"$chain\"})", + "legendFormat": "{{bucket}}", + "range": true, + "refId": "A" + } + ], + "title": "Scanner — positions by bucket", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Current depth of the profit-ordered opportunity queue.", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 10 }, + { "color": "red", "value": 50 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 8 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "charon_executor_queue_depth{instance=~\"$instance\"}", + "legendFormat": "queue depth", + "range": true, + "refId": "A" + } + ], + "title": "Queue depth", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Profit across all 
opportunities that cleared the profit gate (USD), summed over the selected dashboard time range. Uses increase() over the histogram _sum accumulator so process restarts do not appear as negative steps; the window follows the top-right time picker.", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 8 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(increase(charon_executor_profit_usd_cents_sum{instance=~\"$instance\",chain=~\"$chain\"}[$__range])) / 100", + "legendFormat": "profit (selected range)", + "range": true, + "refId": "A" + } + ], + "title": "Profit (selected range)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Simulation outcomes per minute, partitioned by result.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "sim / min", + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never", + "stacking": { "group": "A", "mode": "normal" } + }, + "unit": "short" + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "ok" }, "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "green" } }] }, + { "matcher": { "id": "byName", "options": "revert" }, "properties": [{ "id": "color", "value": { "mode": "fixed", "fixedColor": "orange" } }] }, + { "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", 
"value": { "mode": "fixed", "fixedColor": "red" } }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "id": 6, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (result) (rate(charon_executor_simulations_total{instance=~\"$instance\",chain=~\"$chain\"}[1m])) * 60", + "legendFormat": "{{result}}", + "range": true, + "refId": "A" + } + ], + "title": "Executor — simulations per minute", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Opportunity funnel, per minute, grouped by stage. `queued` counts entries that cleared the profit gate and landed in the queue (the #220 `simulated` label is rolled up here); `router`, `profit`, `simulation`, and `build` are drop stages. Both series share the same `stage` axis so intake and losses are directly comparable.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "opps / min", + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never", + "stacking": { "group": "A", "mode": "normal" } + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "id": 7, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "label_replace(sum(rate(charon_executor_opportunities_queued_total{instance=~\"$instance\",chain=~\"$chain\"}[1m])) * 60, \"stage\", \"queued\", \"\", \"\")", + "legendFormat": "{{stage}}", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": 
"${datasource}" }, + "editorMode": "code", + "expr": "sum by (stage) (rate(charon_executor_opportunities_dropped_total{instance=~\"$instance\",chain=~\"$chain\"}[1m])) * 60", + "legendFormat": "{{stage}}", + "range": true, + "refId": "B" + } + ], + "title": "Executor — opportunities queued vs dropped", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Distribution of per-opportunity net profit (USD cents). Heat-map uses the underlying histogram buckets.", + "fieldConfig": { + "defaults": { + "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "scaleDistribution": { "type": "linear" } } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "id": 8, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { "color": "rgba(255,0,255,0.7)" }, + "filterValues": { "le": 1e-9 }, + "legend": { "show": true }, + "rowsFrame": { "layout": "auto" }, + "tooltip": { "mode": "single", "showColorScale": false, "yHistogram": false }, + "yAxis": { "axisPlacement": "left", "reverse": false, "unit": "short" } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (le) (rate(charon_executor_profit_usd_cents_bucket{instance=~\"$instance\",chain=~\"$chain\"}[5m]))", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Executor — per-opportunity profit distribution (cents)", + "type": "heatmap" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Running build metadata. 
Only `version` is surfaced; `git_sha` is intentionally hidden via transform exclusion until /metrics has auth (open #214) and LAN exposure is addressed (#213). Surfacing the exact SHA of a running binary to anyone with Grafana read access is an intelligence leak while those are unresolved.", + "fieldConfig": { + "defaults": { "custom": { "align": "auto" } }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 9, + "options": { + "cellHeight": "sm", + "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "charon_build_info{instance=~\"$instance\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Build info", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "Value": true, "__name__": true, "job": true, "git_sha": true }, + "indexByName": {}, + "renameByName": {} + } + } + ], + "type": "table" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": ["charon", "liquidation", "defi"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "prometheus" }, + "hide": 0, + "includeAll": false, + "label": "Data source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(charon_scanner_blocks_total, chain)", + "description": "Chain label. 
Defaults to All (.*) so panels render data even before label_values is populated (fresh import or cold start).", + "hide": 0, + "includeAll": true, + "label": "Chain", + "multi": true, + "name": "chain", + "options": [], + "query": { "query": "label_values(charon_scanner_blocks_total, chain)", "refId": "StandardVariableQuery" }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(charon_build_info, instance)", + "description": "Instance label (Prometheus scrape target). Defaults to All (.*) so panels render before the first scrape populates the dropdown.", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "options": [], + "query": { "query": "label_values(charon_build_info, instance)", "refId": "StandardVariableQuery" }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-15m", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Charon Bot", + "uid": "charon-v0", + "version": 1, + "weekStart": "" +}