From e43cd9099122a9a935659fa8ec349e5f0916807d Mon Sep 17 00:00:00 2001 From: JuArce <52429267+JuArce@users.noreply.github.com> Date: Wed, 23 Oct 2024 15:50:27 -0300 Subject: [PATCH 1/3] infra: add watchdog for batcher --- infra/watchdog/batcher/.env.example | 6 ++++ infra/watchdog/batcher/README.md | 36 ++++++++++++++++++++++ infra/watchdog/batcher/batcher_watchdog.sh | 34 ++++++++++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 infra/watchdog/batcher/.env.example create mode 100644 infra/watchdog/batcher/README.md create mode 100644 infra/watchdog/batcher/batcher_watchdog.sh diff --git a/infra/watchdog/batcher/.env.example b/infra/watchdog/batcher/.env.example new file mode 100644 index 0000000000..e7d17d1fd2 --- /dev/null +++ b/infra/watchdog/batcher/.env.example @@ -0,0 +1,6 @@ +PROMETHEUS_URL=: +SYSTEMD_SERVICE=batcher +PROMETHEUS_COUNTER=sent_batches +PROMETHEUS_BOT=batcher +PROMETHEUS_INTERVAL=20m +SLACK_WEBHOOK_URL=<> \ No newline at end of file diff --git a/infra/watchdog/batcher/README.md b/infra/watchdog/batcher/README.md new file mode 100644 index 0000000000..87a6e79681 --- /dev/null +++ b/infra/watchdog/batcher/README.md @@ -0,0 +1,36 @@ +# Batcher Watchdog + +The Batcher Watchdog checks a prometheus metric and restart the batcher as needed + +## Configuration + +You need to create a .env file with the following variables + +``` +PROMETHEUS_URL=: +SYSTEMD_SERVICE=batcher +PROMETHEUS_COUNTER=sent_batches +PROMETHEUS_BOT=batcher +PROMETHEUS_INTERVAL=20m +SLACK_WEBHOOK_URL=<> +``` + +There is a `.env.example` file in this directory. 
+ +## Run with Crontab + +Open the Crontab configuration with `crontab -e` and add the following line: + +``` +*/20 * * * * /path/to/watchdog/batcher_watchdog.sh /path/to/config/.env >> /path/to/logs/folder/batcher_watchdog.log 2>&1 +``` + +You can check logs in the specified file, for example: + +``` +Tue Oct 15 08:00:01 UTC 2024: tasks created in the last 20m: "25" +Tue Oct 15 08:20:01 UTC 2024: tasks created in the last 20m: "2" +Tue Oct 15 08:40:01 UTC 2024: tasks created in the last 20m: "0" +Tue Oct 15 08:40:01 UTC 2024: restarting batcher +Tue Oct 15 08:40:01 UTC 2024: batcher restarted +``` diff --git a/infra/watchdog/batcher/batcher_watchdog.sh b/infra/watchdog/batcher/batcher_watchdog.sh new file mode 100644 index 0000000000..437749eb35 --- /dev/null +++ b/infra/watchdog/batcher/batcher_watchdog.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Load env file from first parameter +# Env variables: +# - PROMETHEUS_URL +# - SYSTEMD_SERVICE +# - PROMETHEUS_COUNTER +# - PROMETHEUS_BOT +# - PROMETHEUS_INTERVAL +# - SLACK_WEBHOOK_URL +source $1 + +# Function to send slack message +# @param message +function send_slack_message() { + curl -X POST -H 'Content-type: application/json' \ + --data "{\"text\":\"$1\"}" \ + $SLACK_WEBHOOK_URL +} + +# Get rate from prometheus +rate=$(curl -gs 'http://'$PROMETHEUS_URL'/api/v1/query?query=floor(increase('$PROMETHEUS_COUNTER'{bot="'$PROMETHEUS_BOT'"}['$PROMETHEUS_INTERVAL']))' | jq '.data.result[0].value[1]') + +echo "$(date): tasks created in the last $PROMETHEUS_INTERVAL: $rate" + +# Check if rate is 0 +if [ "$rate" = \"0\" ]; then + # Restart systemd service + echo "$(date): restarting $SYSTEMD_SERVICE" + sudo systemctl restart $SYSTEMD_SERVICE + message="$(date): $SYSTEMD_SERVICE restarted" + echo $message + send_slack_message "$message" +fi \ No newline at end of file From d87de5bde60d51fc8c17cd4b7017e3c6adba458a Mon Sep 17 00:00:00 2001 From: JuArce <52429267+JuArce@users.noreply.github.com> Date: Wed, 23 Oct 2024 15:51:15 -0300 
Subject: [PATCH 2/3] nit --- infra/watchdog/batcher/.env.example | 2 +- infra/watchdog/batcher/batcher_watchdog.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/infra/watchdog/batcher/.env.example b/infra/watchdog/batcher/.env.example index e7d17d1fd2..336f7774f4 100644 --- a/infra/watchdog/batcher/.env.example +++ b/infra/watchdog/batcher/.env.example @@ -3,4 +3,4 @@ SYSTEMD_SERVICE=batcher PROMETHEUS_COUNTER=sent_batches PROMETHEUS_BOT=batcher PROMETHEUS_INTERVAL=20m -SLACK_WEBHOOK_URL=<> \ No newline at end of file +SLACK_WEBHOOK_URL=<> diff --git a/infra/watchdog/batcher/batcher_watchdog.sh b/infra/watchdog/batcher/batcher_watchdog.sh index 437749eb35..2ccb9842c4 100644 --- a/infra/watchdog/batcher/batcher_watchdog.sh +++ b/infra/watchdog/batcher/batcher_watchdog.sh @@ -31,4 +31,4 @@ if [ "$rate" = \"0\" ]; then message="$(date): $SYSTEMD_SERVICE restarted" echo $message send_slack_message "$message" -fi \ No newline at end of file +fi From 04eef6ddc14ea00c8dab87f2479c1bd07d90a768 Mon Sep 17 00:00:00 2001 From: JuArce <52429267+JuArce@users.noreply.github.com> Date: Thu, 24 Oct 2024 17:26:01 -0300 Subject: [PATCH 3/3] fix: address comments from review --- infra/watchdog/batcher/README.md | 10 ++++++++-- infra/watchdog/batcher/batcher_watchdog.sh | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/infra/watchdog/batcher/README.md b/infra/watchdog/batcher/README.md index 87a6e79681..df66213371 100644 --- a/infra/watchdog/batcher/README.md +++ b/infra/watchdog/batcher/README.md @@ -1,6 +1,10 @@ # Batcher Watchdog -The Batcher Watchdog checks a prometheus metric and restart the batcher as needed +The Batcher Watchdog checks a Prometheus metric and restarts the Batcher as needed. + +The metric is the quantity of batches sent in the last N minutes, defined in the PROMETHEUS_INTERVAL variable. Let's call this metric `sent_batches`. 
+ +Since we are sending proofs constantly, the ideal behaviour is the creation of a task every 3 Ethereum blocks (~36 secs). So, if the `sent_batches` metric is 0, it means there is a problem in the Batcher, for example, a transaction is stuck in Ethereum and the Batcher is locked waiting for the transaction. If this happens, the Watchdog restarts the Batcher. ## Configuration @@ -22,9 +26,11 @@ There is a `.env.example` file in this directory. Open the Crontab configuration with `crontab -e` and add the following line: ``` -*/20 * * * * /path/to/watchdog/batcher_watchdog.sh /path/to/config/.env >> /path/to/logs/folder/batcher_watchdog.log 2>&1 +*/10 * * * * /path/to/watchdog/batcher_watchdog.sh /path/to/config/.env >> /path/to/logs/folder/batcher_watchdog.log 2>&1 ``` +The cron interval must be half of PROMETHEUS_INTERVAL (PROMETHEUS_INTERVAL/2). + You can check logs in the specified file, for example: ``` diff --git a/infra/watchdog/batcher/batcher_watchdog.sh b/infra/watchdog/batcher/batcher_watchdog.sh index 2ccb9842c4..47d8efd518 100644 --- a/infra/watchdog/batcher/batcher_watchdog.sh +++ b/infra/watchdog/batcher/batcher_watchdog.sh @@ -28,7 +28,7 @@ if [ "$rate" = \"0\" ]; then # Restart systemd service echo "$(date): restarting $SYSTEMD_SERVICE" sudo systemctl restart $SYSTEMD_SERVICE - message="$(date): $SYSTEMD_SERVICE restarted" + message="$(date): $SYSTEMD_SERVICE restarted by watchdog" echo $message send_slack_message "$message" fi