obchain · obchain · Apr 24, 2026 · Apr 22, 2026 · Apr 23, 2026 · Apr 23, 2026
diff --git a/.github/workflows/grafana-lint.yml b/.github/workflows/grafana-lint.yml
@@ -0,0 +1,40 @@
+name: grafana-lint  # CI checks for the Grafana dashboard and alert rules
+
+on:
+  push:
+    paths:  # run only when Grafana assets or this workflow file change
+      - 'deploy/grafana/**'
+      - '.github/workflows/grafana-lint.yml'
+  pull_request:
+    paths:  # same path filter as the push trigger
+      - 'deploy/grafana/**'
+      - '.github/workflows/grafana-lint.yml'
+
+permissions:
+  contents: read  # workflow token gets read-only access to repository contents
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10  # cap the job so a hung install or image pull cannot run on
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Validate dashboard JSON syntax
+        run: python3 -m json.tool deploy/grafana/charon.json > /dev/null
+
+      - uses: actions/setup-go@v5  # Go toolchain for the `go install` step below
+        with:
+          go-version: '1.22'
+
+      - name: Install grafana dashboard-linter
+        run: go install github.com/grafana/dashboard-linter@latest  # NOTE(review): unpinned
+
+      - name: Lint dashboard schema and PromQL
+        run: dashboard-linter lint --strict deploy/grafana/charon.json
+      # The image's entrypoint is the Prometheus server; --entrypoint runs promtool instead.
+      - name: Validate alert rules with promtool
+        run: |
+          docker run --rm -v "$PWD/deploy/grafana:/rules:ro" \
+            --entrypoint promtool prom/prometheus:v2.55.1 \
+            check rules /rules/alerts.yaml
diff --git a/README.md b/README.md
@@ -20,6 +20,7 @@ Charon monitors under-collateralized positions across major DeFi lending protoco
 - [Safety model](#safety-model)
 - [Getting started](#getting-started)
 - [Configuration](#configuration)
+- [Metrics](#metrics)
 - [Project structure](#project-structure)
 - [Roadmap](#roadmap)
 - [Contributing](#contributing)
@@ -228,6 +229,41 @@ The fork profile omits `[liquidator.bnb]` by default — after `forge create` ag
 
 ---
 
+## Metrics
+
+Every profile ships with a Prometheus exporter enabled. Scrape `http://<host>:9091/metrics`. The exporter binds `:9091` (not `:9090`) so it doesn't collide with a co-located Prometheus server.
+
+Key series (single source of truth in [`crates/charon-metrics/src/lib.rs`](crates/charon-metrics/src/lib.rs) — the `names` module is what dashboards and alert rules must match):
+
+| Metric | Type | Labels |
+| --- | --- | --- |
+| `charon_scanner_blocks_total` | counter | chain |
+| `charon_scanner_positions` | gauge | chain, bucket |
+| `charon_pipeline_block_duration_seconds` | histogram | chain |
+| `charon_executor_simulations_total` | counter | chain, result |
+| `charon_executor_opportunities_queued_total` | counter | chain |
+| `charon_executor_opportunities_dropped_total` | counter | chain, stage |
+| `charon_executor_profit_usd_cents` | histogram | chain |
+| `charon_executor_queue_depth` | gauge | — |
+
+### Grafana dashboard
+
+A ready-to-import dashboard lives at [`deploy/grafana/charon.json`](deploy/grafana/charon.json) and a matching alert-rule bundle at [`deploy/grafana/alerts.yaml`](deploy/grafana/alerts.yaml). The dashboard is built against **Grafana 10.4.x or newer** (any install that supports panel schema v39 qualifies, as does Grafana Cloud); older 9.x installs will reject the import or silently drop panels.
+
+> **Security — read before exposing `:9091`.** The metrics endpoint ships unauthenticated and binds `0.0.0.0` by default. On a public VPS (Hetzner CX22, the documented target) that exposes profit histograms, build SHA, queue depth, and simulation results to the internet. Before scraping from a remote Prometheus, either bind the exporter to `127.0.0.1` and scrape over a local socket / SSH tunnel / Tailscale, or put a reverse proxy with basic auth (or mTLS) in front of `:9091`. See tracking issues [#213](https://github.com/obchain/Charon/issues/213) and [#214](https://github.com/obchain/Charon/issues/214).
+
+Three steps to load the dashboard into Grafana or Grafana Cloud:
+
+1. Configure a Prometheus server to scrape `http://<charon-host>:9091/metrics` (every ~10 s is fine), then add that Prometheus server as a data source in Grafana. Use a loopback address, a VPN endpoint, or an authenticated reverse-proxy URL as the scrape target — never a raw public-internet address.
+2. In Grafana, **Dashboards → New → Import → Upload JSON file** and pick the file above.
+3. On the import screen, select the Prometheus data source you created and click **Import**.
+
+Dashboard UID is `charon-v0` and tags are `charon`, `liquidation`, `defi` — re-importing over an existing copy replaces it rather than duplicating. Variables (`Chain`, `Instance`) auto-populate from label values once metrics start flowing.
+
+Alert rules in `deploy/grafana/alerts.yaml` can be loaded by Prometheus via `rule_files:` or by Grafana unified alerting (**Alerting → Contact points → Rules → Upload file**). Thresholds are tuned for a single-host BSC deployment on a 3s block cadence — adjust per-environment before wiring a pager.
+
+---
+
 ## Project structure
 
 ```

diff --git a/deploy/grafana/alerts.yaml b/deploy/grafana/alerts.yaml
@@ -0,0 +1,92 @@
+---
+# Prometheus alerting rules for the Charon liquidation bot.
+#
+# Load via Prometheus (rule_files:) or Grafana unified alerting
+# (Alerting -> Contact points -> Rules -> Upload file).
+#
+# Thresholds are tuned for a single-host BSC deployment on a 3s block
+# cadence. Adjust per-environment before wiring a pager.
+
+groups:
+  - name: charon-availability
+    interval: 30s  # rules in this group are evaluated every 30 seconds
+    rules:
+      - alert: CharonScannerStalled
+        # absent(): also fire when the series is missing (bot down / not scraped).
+        expr: (rate(charon_scanner_blocks_total[1m]) == 0) or absent(charon_scanner_blocks_total)
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Charon scanner has stopped consuming blocks"
+          description: |
+            No blocks drained from the listener in the last minute on
+            {{ $labels.instance }} / chain {{ $labels.chain }} (labels are
+            empty when the series is absent). The bot is effectively down —
+            every liquidation opportunity in this window is missed. Check the
+            container status and the WS upstream.
+
+      - alert: CharonNoLiquidationAttemptsOneHour
+        expr: increase(charon_executor_opportunities_queued_total[1h]) == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Charon has queued zero opportunities for 1h"
+          description: |
+            increase(charon_executor_opportunities_queued_total[1h]) == 0
+            on {{ $labels.instance }} / chain {{ $labels.chain }}. Either
+            BSC is unusually healthy (possible), the scanner is broken
+            (possible), or the health check thresholds are mis-tuned.
+            Inspect charon_scanner_positions{bucket="near_liq"} and
+            charon_scanner_positions{bucket="liquidatable"} to disambiguate.
+
+  - name: charon-pipeline-health  # executor queue, simulation and drop-rate alerts
+    interval: 30s  # rules in this group are evaluated every 30 seconds
+    rules:
+      - alert: CharonQueueDepthHigh
+        expr: charon_executor_queue_depth > 50
+        for: 2m  # condition must hold for 2 minutes before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          summary: "Charon executor queue depth sustained above 50"
+          description: |
+            Queue depth {{ $value }} on {{ $labels.instance }} for 2
+            minutes. Executor is not draining; possible private-RPC
+            stall, nonce-manager contention, or gas-ceiling skips.
+
+      - alert: CharonSimulationFailureRateHigh  # revert/error share of all simulations
+        expr: |
+          sum by (instance, chain) (rate(charon_executor_simulations_total{result=~"revert|error"}[5m]))
+            /
+          sum by (instance, chain) (rate(charon_executor_simulations_total[5m]))
+            > 0.5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Charon simulation failure rate > 50% for 5m"
+          description: |
+            Over half of simulations are reverting on
+            {{ $labels.instance }} / chain {{ $labels.chain }}. Most
+            likely: Aave pool paused/frozen, Venus oracle stale, swap
+            router mispriced, or contract ABI drift.
+      # NOTE(review): a queued rate of 0 with drops > 0 yields +Inf and fires — confirm intended.
+      - alert: CharonOpportunityDropRateHigh  # dropped-to-queued ratio per instance and chain
+        expr: |
+          sum by (instance, chain) (rate(charon_executor_opportunities_dropped_total[5m]))
+            /
+          sum by (instance, chain) (rate(charon_executor_opportunities_queued_total[5m]))
+            > 0.9
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Charon dropping > 90% of queued opportunities"
+          description: |
+            Nearly every queued opportunity is dropping on
+            {{ $labels.instance }} / chain {{ $labels.chain }}.
+            Most likely upstream pipeline issue — fresher HF from a
+            newer block, gas ceiling too tight, or min-profit set above
+            current market conditions.