From 15e8cf343c9e0ea2ffc67c64980f9b368c80ea5f Mon Sep 17 00:00:00 2001 From: Chris Stockton Date: Fri, 28 Nov 2025 11:12:34 -0700 Subject: [PATCH 1/4] fix: set restart limits to 0 to prevent being marked as failed The systemd default is 10s / 5 for these values with a DefaultRestartUSec of 100ms. Most services set a RestartSec of 3s; under most circumstances it takes 15s to restart 5 times, so the limit of 10s is not exceeded. However, if other system processes (salt, cloud init) restart it explicitly, or recovering system services within the --before chain trigger a restart, the limit can be exceeded, causing it to be marked as failed. Since no services mark gotrue.service as required, it will remain offline until the next explicit restart is issued. Setting these values to 0 with Restart=always and RestartSec=3 will prevent gotrue from being marked as failed. --- ansible/files/gotrue.service.j2 | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/ansible/files/gotrue.service.j2 b/ansible/files/gotrue.service.j2 index 144448cc6..dbcbd03fe 100644 --- a/ansible/files/gotrue.service.j2 +++ b/ansible/files/gotrue.service.j2 @@ -40,9 +40,19 @@ After=network-online.target systemd-resolved.service Wants=postgresql.service After=postgresql.service -# Lower start limit ival and burst to prevent the noisy flapping -StartLimitIntervalSec=10 -StartLimitBurst=5 +# The systemd default is 10s / 5 for these values with a DefaultRestartUSec of +# 100ms. Most services set a RestartSec of 3s; under most circumstances it +# takes 15s to restart 5 times, so the limit of 10s is not exceeded. However, if +# other system processes (salt, cloud init) restart it explicitly, or recovering +# system services within the --before chain trigger a restart, the limit can be +# exceeded, causing it to be marked as failed. Since no services mark +# gotrue.service as required, it will remain offline until the next explicit +# restart is issued. 
+# +# Setting these values to 0 with Restart=always and RestartSec=3 will prevent +# gotrue from being marked as failed. +StartLimitIntervalSec=0 +StartLimitBurst=0 [Service] Type=exec From a0f7be80b9a7b313a08ef524fc2a6749d4eedef8 Mon Sep 17 00:00:00 2001 From: Chris Stockton Date: Mon, 1 Dec 2025 15:59:23 -0700 Subject: [PATCH 2/4] chore: set StartLimits for persistent services. I've noticed all non-oneshot services set a `RestartSec` of `3s` and we use the systemd defaults of `StartLimitBurst=5` and `StartLimitIntervalSec=10s`. Together these mean that under typical conditions a service will be restarted indefinitely until it comes back up, because `(3s * 5) > 10s`, but it is still possible for a service to enter a failed state under some scenarios. This change defensively sets them to 0/0 to keep them in restart loops. --- ansible/files/adminapi.service.j2 | 5 ++--- ansible/files/nginx.service.j2 | 3 +++ ansible/files/pg_egress_collect.service.j2 | 3 +++ ansible/files/postgres_exporter.service.j2 | 3 +++ ansible/files/postgrest.service.j2 | 3 +++ ansible/files/vector.service.j2 | 3 +++ 6 files changed, 17 insertions(+), 3 deletions(-) diff --git a/ansible/files/adminapi.service.j2 b/ansible/files/adminapi.service.j2 index cc1e9dc2a..305d1ac62 100644 --- a/ansible/files/adminapi.service.j2 +++ b/ansible/files/adminapi.service.j2 @@ -3,9 +3,8 @@ Description=AdminAPI Requires=network-online.target After=network-online.target -# Move this to the Service section if on systemd >=250 -StartLimitIntervalSec=60 -StartLimitBurst=10 +StartLimitIntervalSec=0 +StartLimitBurst=0 [Service] Type=simple diff --git a/ansible/files/nginx.service.j2 b/ansible/files/nginx.service.j2 index 872e3346a..a43c3df60 100644 --- a/ansible/files/nginx.service.j2 +++ b/ansible/files/nginx.service.j2 @@ -3,6 +3,9 @@ Description=nginx server After=postgrest.service gotrue.service adminapi.service Wants=postgrest.service gotrue.service adminapi.service +StartLimitIntervalSec=0 
+StartLimitBurst=0 + [Service] Type=forking ExecStart=/usr/local/nginx/sbin/nginx -c /etc/nginx/nginx.conf diff --git a/ansible/files/pg_egress_collect.service.j2 b/ansible/files/pg_egress_collect.service.j2 index 7ac04f47d..36e1b2074 100644 --- a/ansible/files/pg_egress_collect.service.j2 +++ b/ansible/files/pg_egress_collect.service.j2 @@ -1,6 +1,9 @@ [Unit] Description=Postgres Egress Collector +StartLimitIntervalSec=0 +StartLimitBurst=0 + [Service] Type=simple ExecStart=/bin/bash -c "tcpdump -s 128 -Q out -nn -tt -vv -p -l 'tcp and (port 5432 or port 6543)' | perl /root/pg_egress_collect.pl" diff --git a/ansible/files/postgres_exporter.service.j2 b/ansible/files/postgres_exporter.service.j2 index 6baa18c0d..dcb107cb7 100644 --- a/ansible/files/postgres_exporter.service.j2 +++ b/ansible/files/postgres_exporter.service.j2 @@ -1,6 +1,9 @@ [Unit] Description=Postgres Exporter +StartLimitIntervalSec=0 +StartLimitBurst=0 + [Service] Type=simple ExecStart=/opt/postgres_exporter/postgres_exporter --disable-settings-metrics --extend.query-path="/opt/postgres_exporter/queries.yml" --disable-default-metrics --no-collector.locks --no-collector.replication --no-collector.replication_slot --no-collector.stat_bgwriter --no-collector.stat_database --no-collector.stat_user_tables --no-collector.statio_user_tables --no-collector.wal {% if qemu_mode is defined and qemu_mode %}--no-collector.database {% endif %} diff --git a/ansible/files/postgrest.service.j2 b/ansible/files/postgrest.service.j2 index 290f07720..61102cb42 100644 --- a/ansible/files/postgrest.service.j2 +++ b/ansible/files/postgrest.service.j2 @@ -3,6 +3,9 @@ Description=PostgREST Requires=postgrest-optimizations.service After=postgrest-optimizations.service +StartLimitIntervalSec=0 +StartLimitBurst=0 + [Service] Type=simple # We allow the base config (sent from the worker) to override the generated config diff --git a/ansible/files/vector.service.j2 b/ansible/files/vector.service.j2 index 1c88baa20..05c11e453 
100644 --- a/ansible/files/vector.service.j2 +++ b/ansible/files/vector.service.j2 @@ -4,6 +4,9 @@ Documentation=https://vector.dev After=network-online.target Requires=network-online.target +StartLimitIntervalSec=0 +StartLimitBurst=0 + [Service] User=vector Group=vector From 3f3e133f94f0ba6fc7be3fc0d7518ea5d31117a4 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Tue, 30 Dec 2025 21:14:16 -0500 Subject: [PATCH 3/4] chore: suffix to test --- ansible/vars.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/vars.yml b/ansible/vars.yml index 679f1a42b..1ea76c6db 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.023-orioledb" - postgres17: "17.6.1.066" - postgres15: "15.14.1.066" + postgresorioledb-17: "17.6.0.024-orioledb-auth-1" + postgres17: "17.6.1.067-auth-1" + postgres15: "15.14.1.067-auth-1" # Non Postgres Extensions pgbouncer_release: 1.19.0 From 0646ba3ed09cd278e8a33d3942ac85e8f20285b2 Mon Sep 17 00:00:00 2001 From: Sam Rose Date: Wed, 31 Dec 2025 01:33:16 -0500 Subject: [PATCH 4/4] chore: bump to release --- ansible/vars.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/vars.yml b/ansible/vars.yml index 1ea76c6db..a5b9c7215 100644 --- a/ansible/vars.yml +++ b/ansible/vars.yml @@ -10,9 +10,9 @@ postgres_major: # Full version strings for each major version postgres_release: - postgresorioledb-17: "17.6.0.024-orioledb-auth-1" - postgres17: "17.6.1.067-auth-1" - postgres15: "15.14.1.067-auth-1" + postgresorioledb-17: "17.6.0.024-orioledb" + postgres17: "17.6.1.067" + postgres15: "15.14.1.067" # Non Postgres Extensions pgbouncer_release: 1.19.0