diff --git a/Makefile b/Makefile index 4a0ec13..5a02cb9 100644 --- a/Makefile +++ b/Makefile @@ -187,10 +187,12 @@ tf: ## use this to spawn a loaded shell export TF_VAR_GRAFANA_DATASOURCE_URL=\$$(aws s3 presign \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/grafana_datasource.yml\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" --expires-in 3600 | tr -d '\r') && \ aws s3 cp infra/common/grafana/provisioning/dashboards/default.yml \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/grafana_provider.yml\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" && \ export TF_VAR_GRAFANA_PROVIDER_URL=\$$(aws s3 presign \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/grafana_provider.yml\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" --expires-in 3600 | tr -d '\r') && \ - aws s3 cp infra/common/grafana/provisioning/dashboards/ytdlp-health.json \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/ytdlp-health.json\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" && \ - export TF_VAR_YTDLP_DASHBOARD_URL=\$$(aws s3 presign \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/ytdlp-health.json\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" --expires-in 3600 | tr -d '\\r') && \ - aws s3 cp infra/common/grafana/provisioning/dashboards/captcha-security.json \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/captcha-security.json\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" && \ - export TF_VAR_CAPTCHA_SECURITY_DASHBOARD_URL=\$$(aws s3 presign \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/captcha-security.json\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" --expires-in 3600 | tr -d '\\r') && \ + aws s3 cp infra/common/grafana/provisioning/dashboards/api-health.json \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/api-health.json\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" && \ + export TF_VAR_API_HEALTH_DASHBOARD_URL=\$$(aws s3 presign \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/api-health.json\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" --expires-in 3600 | tr -d '\\r') && \ + aws s3 cp infra/common/grafana/provisioning/dashboards/security-overview.json \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/security-overview.json\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" && \ + export TF_VAR_SECURITY_OVERVIEW_DASHBOARD_URL=\$$(aws s3 presign \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/security-overview.json\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" --expires-in 3600 | tr -d '\\r') && \ + aws s3 cp infra/common/grafana/provisioning/dashboards/domain-services.json \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/domain-services.json\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" && \ + export TF_VAR_DOMAIN_SERVICES_DASHBOARD_URL=\$$(aws s3 presign \"s3://\$$AWS_S3_BUCKET_NAME/terraform/data/domain-services.json\" --endpoint-url \"\$$AWS_ENDPOINT_URL_S3\" --expires-in 3600 | tr -d '\\r') && \ export MSYS_NO_PATHCONV=1 && \ cd $(TF_STACK_DIR) && \ unset PROMPT_COMMAND && \ diff --git a/docker-compose.yml b/docker-compose.yml index bc862be..7853a13 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,4 +1,6 @@ -# Integrated WARP Proxy. + +# Production docker-compose file: refer to infra/common/cloud-init.template + name: nadzu services: warp: @@ -18,7 +20,7 @@ services: depends_on: - warp volumes: - - "${MOUNT_PATH}:/home/app/downloads" + - "./production/downloads:/home/app/downloads" env_file: - .env environment: @@ -34,10 +36,10 @@ services: - "80:80" - "443:443" volumes: - - /opt/app/Caddyfile:/etc/caddy/Caddyfile:ro - - /opt/app/certs:/etc/caddy/certs:ro - - /opt/app/browse.html:/etc/caddy/browse.html:ro - - "${MOUNT_PATH}:/home/app/downloads:ro" + - Caddyfile:/etc/caddy/Caddyfile:ro + - ./infra/common/certs:/etc/caddy/certs:ro + - ./infra/common/browse.html:/etc/caddy/browse.html:ro + - "./production/downloads:/home/app/downloads:ro" depends_on: - app @@ -62,4 +64,4 @@ services: - :/etc/grafana/provisioning depends_on: - prometheus - \ No newline at end of file + diff --git a/infra/common/cloud-init.template b/infra/common/cloud-init.template index 161ef9b..542f144 100644 --- a/infra/common/cloud-init.template +++ b/infra/common/cloud-init.template @@ -248,8 +248,9 @@ write_files: fetch_url "${PROMETHEUS_CONFIG_URL}" /opt/app/prometheus/prometheus.yml fetch_url "${GRAFANA_DATASOURCE_URL}" /opt/app/grafana/provisioning/datasources/prometheus.yml fetch_url "${GRAFANA_PROVIDER_URL}" /opt/app/grafana/provisioning/dashboards/default.yml - fetch_url "${YTDLP_DASHBOARD_URL}" /opt/app/grafana/provisioning/dashboards/ytdlp-health.json - fetch_url "${CAPTCHA_SECURITY_DASHBOARD_URL}" /opt/app/grafana/provisioning/dashboards/captcha-security.json + fetch_url "${API_HEALTH_DASHBOARD_URL}" /opt/app/grafana/provisioning/dashboards/api-health.json + fetch_url "${SECURITY_OVERVIEW_DASHBOARD_URL}" /opt/app/grafana/provisioning/dashboards/security-overview.json + fetch_url "${DOMAIN_SERVICES_DASHBOARD_URL}" /opt/app/grafana/provisioning/dashboards/domain-services.json # APP_PORT: Dynamic sed -i "s/app:[0-9]*/app:${APP_PORT}/g" /opt/app/prometheus/prometheus.yml diff --git a/infra/common/grafana/provisioning/dashboards/api-health.json b/infra/common/grafana/provisioning/dashboards/api-health.json new file mode 100644 index 0000000..c39a8cc --- /dev/null +++ b/infra/common/grafana/provisioning/dashboards/api-health.json @@ -0,0 +1,88 @@ +{ + "title": "API & System Health", + "uid": "api-health", + "timezone": "browser", + "refresh": "5s", + "panels": [ + { + "title": "Global Traffic (RPS)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "sum by (status) (rate(http_requests_total[1m]))", + "legendFormat": "{{status}}" + } + ], + "options": { + "tooltip": { "mode": "multi" } + }, + "fieldConfig": { + "defaults": { + "custom": { "stacking": { "mode": "normal", "group": "A" } } + } + } + }, + { + "title": "Latency Percentiles", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))", + "legendFormat": "P99" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))", + "legendFormat": "P95" + }, + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))", + "legendFormat": "P50" + } + ] + }, + { + "title": "Traffic by Route", + "type": "piechart", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "sum by (path) (increase(http_requests_total[1h]))", + "legendFormat": "{{path}}" + } + ], + "options": { + "pieType": "donut" + } + }, + { + "title": "Error Rate (%)", + "type": "stat", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100", + "legendFormat": "5xx Error Rate" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 1 }, + { "color": "red", "value": 5 } + ] + } + } + } + } + ] +} \ No newline at end of file diff --git a/infra/common/grafana/provisioning/dashboards/captcha-security.json b/infra/common/grafana/provisioning/dashboards/captcha-security.json deleted file mode 100644 index 2beec11..0000000 --- a/infra/common/grafana/provisioning/dashboards/captcha-security.json +++ /dev/null @@ -1,172 +0,0 @@ -{ - "title": "Captcha & Security Dashboard", - "description": "Monitoring for Captcha verification hits, errors, and API key bypasses.", - "panels": [ - { - "title": "Total Captcha Requests (5m)", - "type": "stat", - "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, - "datasource": "Prometheus", - "targets": [ - { - "expr": "sum(increase(captcha_check_total[5m]))", - "instant": true, - "refId": "A" - } - ], - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "reduceOptions": { "calcs": ["lastNotNull"] } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } - } - } - }, - { - "title": "Success Rate (%)", - "type": "stat", - "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, - "datasource": "Prometheus", - "targets": [ - { - "expr": "sum(increase(captcha_check_total{status=\"success\"}[5m])) / sum(increase(captcha_check_total[5m])) * 100", - "instant": true, - "refId": "A" - } - ], - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "reduceOptions": { "calcs": ["lastNotNull"] } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } - } - } - }, - { - "title": "Bypass Rate (%)", - "type": "stat", - "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, - "datasource": "Prometheus", - "targets": [ - { - "expr": "sum(increase(captcha_check_total{status=\"bypass\"}[5m])) / sum(increase(captcha_check_total[5m])) * 100", - "instant": true, - "refId": "A" - } - ], - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "reduceOptions": { "calcs": ["lastNotNull"] } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } - } - } - }, - { - "title": "Failure Rate (%)", - "type": "stat", - "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, - "datasource": "Prometheus", - "targets": [ - { - "expr": "sum(increase(captcha_check_total{status=\"failure\"}[5m])) / sum(increase(captcha_check_total[5m])) * 100", - "instant": true, - "refId": "A" - } - ], - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "reduceOptions": { "calcs": ["lastNotNull"] } - }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] - } - } - } - }, - { - "title": "Captcha Traffic Status", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, - "datasource": "Prometheus", - "targets": [ - { - "expr": "sum by (status) (rate(captcha_check_total[1m]))", - "legendFormat": "{{status}}", - "refId": "A" - } - ], - "options": { - "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "multi", "sort": "none" } - } - }, - { - "title": "Failure Reasons (1h)", - "type": "piechart", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, - "datasource": "Prometheus", - "targets": [ - { - "expr": "sum by (reason) (increase(captcha_check_total{status=\"failure\"}[1h]))", - "legendFormat": "{{reason}}", - "refId": "A" - } - ], - "options": { - "displayLabels": ["percent"], - "legend": { "displayMode": "list", "placement": "right", "showLegend": true }, - "pieType": "donut", - "tooltip": { "mode": "single", "sort": "none" } - } - }, - { - "title": "Verification Latency (seconds)", - "type": "timeseries", - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 }, - "datasource": "Prometheus", - "targets": [ - { - "expr": "histogram_quantile(0.99, sum by (le) (rate(captcha_verify_duration_seconds_bucket[5m])))", - "legendFormat": "P99 Latency", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(captcha_verify_duration_seconds_bucket[5m])))", - "legendFormat": "P95 Latency", - "refId": "B" - }, - { - "expr": "sum(rate(captcha_verify_duration_seconds_sum[5m])) / sum(rate(captcha_verify_duration_seconds_count[5m]))", - "legendFormat": "Avg Latency", - "refId": "C" - } - ], - "options": { - "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "multi", "sort": "none" } - } - } - ] -} diff --git a/infra/common/grafana/provisioning/dashboards/domain-services.json b/infra/common/grafana/provisioning/dashboards/domain-services.json new file mode 100644 index 0000000..822d27d --- /dev/null +++ b/infra/common/grafana/provisioning/dashboards/domain-services.json @@ -0,0 +1,69 @@ +{ + "title": "Domain Services", + "uid": "domain-services", + "timezone": "browser", + "refresh": "5s", + "panels": [ + { + "title": "YT-DLP Active Workers", + "type": "gauge", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "ytdlp_active_jobs", + "legendFormat": "Active Jobs" + } + ] + }, + { + "title": "YT-DLP Success vs Error", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "sum by (status) (rate(ytdlp_jobs_completed_total[5m]))", + "legendFormat": "{{status}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { "stacking": { "mode": "normal", "group": "A" } } + } + } + }, + { + "title": "GitHub Cache Efficiency", + "type": "piechart", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "sum by (source) (increase(github_contributions_fetch_total[1h]))", + "legendFormat": "{{source}}" + } + ], + "options": { + "pieType": "donut" + } + }, + { + "title": "GitHub Upstream Latency (P95)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(github_api_duration_seconds_bucket[5m])))", + "legendFormat": "P95" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + } + } + } + ] +} \ No newline at end of file diff --git a/infra/common/grafana/provisioning/dashboards/security-overview.json b/infra/common/grafana/provisioning/dashboards/security-overview.json new file mode 100644 index 0000000..d18c499 --- /dev/null +++ b/infra/common/grafana/provisioning/dashboards/security-overview.json @@ -0,0 +1,69 @@ +{ + "title": "Security & Traffic Control", + "uid": "security-overview", + "timezone": "browser", + "refresh": "5s", + "panels": [ + { + "title": "Rate Limit Rejections", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "sum by (tier) (rate(rate_limit_checks_total{status=\"rejected\"}[1m]))", + "legendFormat": "{{tier}}" + } + ] + }, + { + "title": "API Key Usage (1h)", + "type": "piechart", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "sum by (status) (increase(auth_api_key_check_total[1h]))", + "legendFormat": "{{status}}" + } + ], + "options": { + "pieType": "donut" + } + }, + { + "title": "Captcha Traffic Status", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "sum by (status) (rate(captcha_check_total[1m]))", + "legendFormat": "{{status}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { "stacking": { "mode": "normal", "group": "A" } } + } + } + }, + { + "title": "Captcha Verification Latency (P99)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": "Prometheus", + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(captcha_verify_duration_seconds_bucket[5m])))", + "legendFormat": "P99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + } + } + } + ] +} \ No newline at end of file diff --git a/infra/common/grafana/provisioning/dashboards/ytdlp-health.json b/infra/common/grafana/provisioning/dashboards/ytdlp-health.json deleted file mode 100644 index 6b02448..0000000 --- a/infra/common/grafana/provisioning/dashboards/ytdlp-health.json +++ /dev/null @@ -1,137 +0,0 @@ -{ - "title": "Nadzu API - Worker Health", - "timezone": "browser", - "refresh": "5s", - "schemaVersion": 39, - "panels": [ - { - "title": "Current Success Rate", - "type": "stat", - "gridPos": { "h": 6, "w": 8, "x": 0, "y": 0 }, - "targets": [ - { - "datasource": "Prometheus", - "expr": "(((sum(ytdlp_jobs_completed_total{status=\"success\"}) or vector(0)) / (sum(ytdlp_jobs_completed_total) > 0)) * 100) or vector(100)", - "legendFormat": "Success Rate" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "noValue": 100, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "red", "value": null }, - { "color": "orange", "value": 80 }, - { "color": "green", "value": 95 } - ] - } - } - } - }, - { - "title": "Active Workers (Saturation)", - "type": "gauge", - "gridPos": { "h": 6, "w": 8, "x": 8, "y": 0 }, - "targets": [ - { - "datasource": "Prometheus", - "expr": "ytdlp_active_jobs" - } - ], - "fieldConfig": { - "defaults": { - "min": 0, - "max": 10, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "orange", "value": 5 }, - { "color": "red", "value": 8 } - ] - } - } - } - }, - { - "title": "Recent Error Count (1h)", - "type": "stat", - "gridPos": { "h": 6, "w": 8, "x": 16, "y": 0 }, - "targets": [ - { - "datasource": "Prometheus", - "expr": "sum(increase(ytdlp_jobs_completed_total{status=\"error\"}[1h]))" - } - ], - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 1 } - ] - } - } - } - }, - { - "title": "Traffic Volume (Success vs Error)", - "type": "timeseries", - "gridPos": { "h": 9, "w": 24, "x": 0, "y": 6 }, - "targets": [ - { - "datasource": "Prometheus", - "expr": "sum by (status) (rate(ytdlp_jobs_completed_total[5m]))", - "legendFormat": "{{status}}" - } - ], - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "line", - "fillOpacity": 20, - "lineInterpolation": "smooth", - "stacking": { "group": "A", "mode": "normal" } - } - }, - "overrides": [ - { - "matcher": { "id": "byName", "options": "success" }, - "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] - }, - { - "matcher": { "id": "byName", "options": "error" }, - "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] - } - ] - } - }, - { - "title": "Processing Latency (P50 vs P99)", - "type": "timeseries", - "gridPos": { "h": 9, "w": 24, "x": 0, "y": 15 }, - "targets": [ - { - "datasource": "Prometheus", - "expr": "ytdlp_job_duration_seconds{quantile=\"0.5\"}", - "legendFormat": "P50 (Median)" - }, - { - "datasource": "Prometheus", - "expr": "ytdlp_job_duration_seconds{quantile=\"0.99\"}", - "legendFormat": "P99 (Worst Case)" - } - ], - "fieldConfig": { - "defaults": { - "unit": "s", - "custom": { "lineInterpolation": "smooth" } - } - } - } - ] -} \ No newline at end of file diff --git a/infra/digitalocean/accounts/naduns-team/main.tf b/infra/digitalocean/accounts/naduns-team/main.tf index 3e2e489..4167746 100644 --- a/infra/digitalocean/accounts/naduns-team/main.tf +++ b/infra/digitalocean/accounts/naduns-team/main.tf @@ -37,6 +37,7 @@ module "digitalocean_stack" { GRAFANA_PROVIDER_URL = var.GRAFANA_PROVIDER_URL GRAFANA_ADMIN_USER = var.GRAFANA_ADMIN_USER GRAFANA_ADMIN_PASSWORD = var.GRAFANA_ADMIN_PASSWORD - YTDLP_DASHBOARD_URL = var.YTDLP_DASHBOARD_URL - CAPTCHA_SECURITY_DASHBOARD_URL = var.CAPTCHA_SECURITY_DASHBOARD_URL + API_HEALTH_DASHBOARD_URL = var.API_HEALTH_DASHBOARD_URL + SECURITY_OVERVIEW_DASHBOARD_URL = var.SECURITY_OVERVIEW_DASHBOARD_URL + DOMAIN_SERVICES_DASHBOARD_URL = var.DOMAIN_SERVICES_DASHBOARD_URL } diff --git a/infra/digitalocean/components/locals.tf b/infra/digitalocean/components/locals.tf index bf9682b..350dc5f 100644 --- a/infra/digitalocean/components/locals.tf +++ b/infra/digitalocean/components/locals.tf @@ -35,8 +35,9 @@ locals { GRAFANA_PROVIDER_URL = var.GRAFANA_PROVIDER_URL GRAFANA_ADMIN_USER = var.GRAFANA_ADMIN_USER GRAFANA_ADMIN_PASSWORD = var.GRAFANA_ADMIN_PASSWORD - YTDLP_DASHBOARD_URL = var.YTDLP_DASHBOARD_URL - CAPTCHA_SECURITY_DASHBOARD_URL = var.CAPTCHA_SECURITY_DASHBOARD_URL + API_HEALTH_DASHBOARD_URL = var.API_HEALTH_DASHBOARD_URL + SECURITY_OVERVIEW_DASHBOARD_URL = var.SECURITY_OVERVIEW_DASHBOARD_URL + DOMAIN_SERVICES_DASHBOARD_URL = var.DOMAIN_SERVICES_DASHBOARD_URL SSH_ALLOWED_IPS = join(" ", var.SSH_ALLOWED_IPS) PRODUCTION_DOMAIN = join(".", [var.CLOUDFLARE_RECORD_NAME, var.CLOUDFLARE_ZONE_NAME]) CADDY_CLOUDFLARE_TRUSTED_PROXIES = join(" ", concat( diff --git a/src/app.rs b/src/app.rs index eae03b1..609b3d5 100644 --- a/src/app.rs +++ b/src/app.rs @@ -22,7 +22,7 @@ pub async fn run() { // 1. Load environment variables from .env file dotenv().ok(); - // 2. Initialize structured logging and environment-based log filtering + // 2. Initialize structured logging and prometheus metrics telemetry::init_tracing(); tracing::info!("nadzu app::run() starting"); @@ -73,6 +73,9 @@ pub async fn run() { state.clone(), enforce_tiered_rate_limit, )) + .layer(middleware::from_fn( + crate::middleware::metrics::track_http_metrics, + )) .layer(trace_layer) .layer(cors_layer) .layer(compression_layer); diff --git a/src/middleware/api_key.rs b/src/middleware/api_key.rs index e075470..3da2c7a 100644 --- a/src/middleware/api_key.rs +++ b/src/middleware/api_key.rs @@ -4,6 +4,7 @@ use axum::{ middleware::Next, response::Response, }; +use metrics::counter; use crate::{config::AppConfig, error::AppError, middleware::HEADER_API_KEY, state::AppState}; @@ -23,8 +24,10 @@ pub async fn require_api_key( next: Next, ) -> Result { if has_valid_master_api_key(req.headers(), state.config.as_ref()) { + counter!("auth_api_key_check_total", "status" => "valid").increment(1); Ok(next.run(req).await) } else { + counter!("auth_api_key_check_total", "status" => "invalid").increment(1); Err(AppError::Unauthorized( "Invalid or missing API key".to_string(), )) diff --git a/src/middleware/metrics.rs b/src/middleware/metrics.rs new file mode 100644 index 0000000..f1832bd --- /dev/null +++ b/src/middleware/metrics.rs @@ -0,0 +1,39 @@ +use axum::{ + extract::{MatchedPath, Request}, + middleware::Next, + response::Response, +}; +use metrics::{counter, histogram}; +use std::time::Instant; + +/// Middleware that records global HTTP traffic metrics. +pub async fn track_http_metrics(req: Request, next: Next) -> Response { + let method = req.method().to_string(); + let path = req.extensions().get::().map_or_else( + || req.uri().path().to_string(), + |matched_path| matched_path.as_str().to_string(), + ); + + let start = Instant::now(); + let response = next.run(req).await; + let latency = start.elapsed().as_secs_f64(); + let status = response.status().as_u16().to_string(); + + histogram!( + "http_request_duration_seconds", + "method" => method.clone(), + "path" => path.clone(), + "status" => status.clone() + ) + .record(latency); + + counter!( + "http_requests_total", + "method" => method, + "path" => path, + "status" => status + ) + .increment(1); + + response +} diff --git a/src/middleware/mod.rs b/src/middleware/mod.rs index 9f72a9d..c189b7d 100644 --- a/src/middleware/mod.rs +++ b/src/middleware/mod.rs @@ -3,6 +3,7 @@ use axum::http::header::HeaderName; pub mod api_key; pub mod captcha; pub mod cors; +pub mod metrics; pub mod rate_limit; pub const HEADER_API_KEY: HeaderName = HeaderName::from_static("x-api-key"); diff --git a/src/middleware/rate_limit.rs b/src/middleware/rate_limit.rs index 78c2dbd..fbfc144 100644 --- a/src/middleware/rate_limit.rs +++ b/src/middleware/rate_limit.rs @@ -139,12 +139,15 @@ pub async fn enforce_tiered_rate_limit( let client_key = request_client_key(&req, state.config.as_ref()); let limiter = state.rate_limiters.limiter_for_api_key(has_valid_api_key); + let tier = if has_valid_api_key { + "enhanced" + } else { + "normal" + }; + if limiter.check_key(&client_key).is_err() { - let tier = if has_valid_api_key { - "enhanced" - } else { - "normal" - }; + metrics::counter!("rate_limit_checks_total", "tier" => tier, "status" => "rejected") + .increment(1); debug!( client_key = %client_key, tier = tier, @@ -153,6 +156,9 @@ pub async fn enforce_tiered_rate_limit( return Err(AppError::Forbidden(format!("Rate limit exceeded ({tier})"))); } + metrics::counter!("rate_limit_checks_total", "tier" => tier, "status" => "allowed") + .increment(1); + Ok(next.run(req).await) } diff --git a/src/services/contributions.rs b/src/services/contributions.rs index fede238..8d6420b 100644 --- a/src/services/contributions.rs +++ b/src/services/contributions.rs @@ -1,4 +1,5 @@ use dashmap::DashMap; +use metrics::{counter, histogram}; use reqwest::Client; use serde::Serialize; use std::borrow::Cow; @@ -164,6 +165,7 @@ impl ContributionsService { if *expires_at > now { let mut resp = cached_resp.clone(); resp.meta.cached = true; + counter!("github_contributions_fetch_total", "source" => "cache", "status" => "success").increment(1); return Ok(resp); } } @@ -182,7 +184,10 @@ impl ContributionsService { if self.cache.len() < CACHE_MAX_CAPACITY { self.cache.insert(cache_key, (new_resp.clone(), expires_at)); + #[allow(clippy::cast_precision_loss)] + metrics::gauge!("github_cache_size").set(self.cache.len() as f64); } + metrics::counter!("github_contributions_fetch_total", "source" => "api", "status" => "success").increment(1); Ok(new_resp) } Err(e) => { @@ -190,8 +195,10 @@ impl ContributionsService { if let Some(entry) = self.cache.get(&cache_key) { let mut resp = entry.value().0.clone(); resp.meta.cached = true; + metrics::counter!("github_contributions_fetch_total", "source" => "cache", "status" => "stale_fallback").increment(1); return Ok(resp); } + metrics::counter!("github_contributions_fetch_total", "source" => "api", "status" => "error").increment(1); Err(e) } } @@ -210,6 +217,7 @@ impl ContributionsService { }, }; + let start_time = std::time::Instant::now(); let resp = self .http_client .post(&self.graphql_url) @@ -218,8 +226,12 @@ impl ContributionsService { .timeout(Duration::from_secs(30)) .json(&payload) .send() - .await - .map_err(|e| AppError::UpstreamError(format!("Network or timeout error: {e}")))?; + .await; + + histogram!("github_api_duration_seconds").record(start_time.elapsed().as_secs_f64()); + + let resp = + resp.map_err(|e| AppError::UpstreamError(format!("Network or timeout error: {e}")))?; if !resp.status().is_success() { return Err(AppError::UpstreamError(format!(