diff --git a/grafana/common/Bloat_Details.json b/grafana/common/Bloat_Details.json index a9419dc6..0b6efcfb 100644 --- a/grafana/common/Bloat_Details.json +++ b/grafana/common/Bloat_Details.json @@ -362,7 +362,7 @@ "query": "label_values(up{exp_type='pg'}, cluster_name)", "refId": "StandardVariableQuery" }, - "refresh": 0, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, diff --git a/grafana/common/QueryStatistics.json b/grafana/common/QueryStatistics.json index 908d7b32..fd46cb7c 100644 --- a/grafana/common/QueryStatistics.json +++ b/grafana/common/QueryStatistics.json @@ -736,7 +736,7 @@ "query": "label_values(up{exp_type='pg'}, cluster_name)", "refId": "StandardVariableQuery" }, - "refresh": 0, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -803,7 +803,7 @@ "query": "label_values(ccp_pg_stat_statements_total_calls_count{cluster_name=\"[[pgcluster]]\", job=\"[[pgnodes]]\"}, dbname) ", "refId": "StandardVariableQuery" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, @@ -838,7 +838,7 @@ "query": "label_values(ccp_pg_stat_statements_total_calls_count{cluster_name=\"[[pgcluster]]\", job=\"[[pgnodes]]\",dbname=~\"[[pgdatabase]]\"}, role) ", "refId": "StandardVariableQuery" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, diff --git a/hugo/content/changelog/_index.md b/hugo/content/changelog/_index.md index e9d5a6e5..3e5fb502 100644 --- a/hugo/content/changelog/_index.md +++ b/hugo/content/changelog/_index.md @@ -12,14 +12,20 @@ weight: 5 * The pgBackRest Grafana dashboard now presents data on a per-stanza basis * Removed deprecated node_exporter metrics from Grafana OS Details dashboard. Reorganized panels. * Added a basic Network Activity dashboard to Grafana using default metrics that come with node_exporter. - * The pgMonitor repository has been reorganized to better delineate which platforms specific files apply to. 
Some files have also been renamed as part of this reorganization. + * The pgMonitor repository has been reorganized around which platforms files apply to. Some files have also been renamed as part of this reorganization. * Extended the default alert threshold for pgBackRest backups to give a buffer time and avoid false positives when backup runtimes vary. * Added a default alert for PostgreSQL failover that should work in any scenario to produce an alert when the recovery status of a PostgreSQL database changes (replica -> primary or primary -> replica). Note that this alert will auto-resolve after 5 minutes (by default) since it is just looking for recent state changes. The alert is meant to be acted upon immediately to see what may have occured on the systems involved. * Added metric to monitor and alert on blocked queries ### Bug Fixes + * Fixed several incorrect metric names in alert expressions for the example alert files. Please review all alerts to ensure your expressions are checking the correct metrics, making special note of the following: + * PGSettingsChecksum + * PGDBSize + * PGReplicationByteLag + * MemoryAvailable + * SwapUsage + * All pgBackRest alerts * Fixed pgBackRest metrics not reporting all backups in all stanzas for a given repository in some configuration setups. Each database will now only report back monitoring for the stanzas that are part of its own instance. Previously all database instances reported back all stanzas in the target repository. - * Fixed incorrect metric name in warning alert for available memory in linux/node_exporter default alerts (node_memory_Available_bytes should be node_memory_MemAvailable_bytes) * Fixed incorrect title of panel on Grafana PostgreSQL Details dashboard from "Transactions Per Minute" to "Transactions Per Second". 
### Manual Intervention Changes diff --git a/hugo/content/exporter/_index.md b/hugo/content/exporter/_index.md index 249f8b28..131b0e98 100644 --- a/hugo/content/exporter/_index.md +++ b/hugo/content/exporter/_index.md @@ -398,7 +398,9 @@ The following metrics either require special considerations when monitoring spec * *ccp_connection_stats_max_blocked_query_time* - Runtime of the longest running query that has been blocked by a heavyweight lock - * *ccp_replication_lag_replay_time* - Only provides values on replica instances. Time since replica received and replayed a WAL file. Note this is not the main way to determine if a replica is behind its primary. It only monitors the time the replica replayed the WAL vs what it has received. It is a secondary metric for monitoring WAL replay on the replica itself. + * *ccp_replication_lag_replay_time* - Time since a replica received and replayed a WAL file; only shown on replica instances. Note that this is not the main way to determine if a replica is behind its primary. This metric only monitors the time since the replica replayed the WAL vs when it was received. It also does not monitor when a WAL replay replica completely stops receiving WAL (see received_time metric). It is a secondary metric for monitoring WAL replay on the replica itself. + + * *ccp_replication_lag_received_time* - Similar to *ccp_replication_lag_replay_time*, however this value always increases between replay of WAL files. Effective for monitoring that a WAL replay replica has actually received WAL files. Note this will cause false positives when used as an alert for replica lag if the primary receives no writes (which means there is no WAL to send). * *ccp_replication_lag_size_bytes* - Only provides values on instances that have attached replicas (primary, cascading replica). Tracks byte lag of every streaming replica connected to this database instance. This is the main way that replication lag is monitored. 
Note that if you have WAL replay only replicas, this will not be reflected here. diff --git a/postgres_exporter/common/pg10/queries_general.yml b/postgres_exporter/common/pg10/queries_general.yml index 27dc9164..6d8816ac 100644 --- a/postgres_exporter/common/pg10/queries_general.yml +++ b/postgres_exporter/common/pg10/queries_general.yml @@ -53,11 +53,16 @@ ccp_replication_lag: WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER END - AS replay_time" + AS replay_time + , EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + AS received_time" metrics: - replay_time: usage: "GAUGE" - description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary." + description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL received equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." + - received_time: + usage: "GAUGE" + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, possibly causing false positives if the primary stops writing. Monitors for replicas that stop receiving WAL altogether." 
ccp_replication_lag_size: diff --git a/postgres_exporter/common/pg10/setup.sql b/postgres_exporter/common/pg10/setup.sql index dfc29a34..a52d0124 100644 --- a/postgres_exporter/common/pg10/setup.sql +++ b/postgres_exporter/common/pg10/setup.sql @@ -18,7 +18,6 @@ END $$; GRANT pg_monitor to ccp_monitoring; -GRANT pg_execute_server_program TO ccp_monitoring; ALTER ROLE ccp_monitoring SET lock_timeout TO '2min'; @@ -32,7 +31,7 @@ ALTER TABLE monitor.pgbackrest_info SET (autovacuum_analyze_scale_factor = 0, au DROP FUNCTION IF EXISTS monitor.pgbackrest_info(); -- old version from 2.3 DROP FUNCTION IF EXISTS monitor.pgbackrest_info(int); CREATE OR REPLACE FUNCTION monitor.pgbackrest_info(p_throttle_minutes int DEFAULT 10) RETURNS SETOF monitor.pgbackrest_info - LANGUAGE plpgsql + LANGUAGE plpgsql SECURITY DEFINER SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE diff --git a/postgres_exporter/common/pg11/queries_general.yml b/postgres_exporter/common/pg11/queries_general.yml index 8505dfbf..9995fc5a 100644 --- a/postgres_exporter/common/pg11/queries_general.yml +++ b/postgres_exporter/common/pg11/queries_general.yml @@ -53,11 +53,16 @@ ccp_replication_lag: WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER END - AS replay_time" + AS replay_time + , EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + AS received_time" metrics: - replay_time: usage: "GAUGE" - description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary." + description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL received equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." 
+ - received_time: + usage: "GAUGE" + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, possibly causing false positives if the primary stops writing. Monitors for replicas that stop receiving WAL altogether." ccp_replication_lag_size: diff --git a/postgres_exporter/common/pg11/setup.sql b/postgres_exporter/common/pg11/setup.sql index f1f6ef75..3da50ee2 100644 --- a/postgres_exporter/common/pg11/setup.sql +++ b/postgres_exporter/common/pg11/setup.sql @@ -18,6 +18,7 @@ END $$; GRANT pg_monitor to ccp_monitoring; +GRANT pg_execute_server_program TO ccp_monitoring; ALTER ROLE ccp_monitoring SET lock_timeout TO '2min'; @@ -31,7 +32,7 @@ ALTER TABLE monitor.pgbackrest_info SET (autovacuum_analyze_scale_factor = 0, au DROP FUNCTION IF EXISTS monitor.pgbackrest_info(); -- old version from 2.3 DROP FUNCTION IF EXISTS monitor.pgbackrest_info(int); CREATE OR REPLACE FUNCTION monitor.pgbackrest_info(p_throttle_minutes int DEFAULT 10) RETURNS SETOF monitor.pgbackrest_info - LANGUAGE plpgsql SECURITY DEFINER + LANGUAGE plpgsql SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -73,7 +74,7 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_status(); CREATE FUNCTION monitor.sequence_status() RETURNS TABLE (sequence_name text, last_value bigint, slots numeric, used numeric, percent int, cycle boolean, numleft numeric, table_usage text) - LANGUAGE sql SECURITY DEFINER + LANGUAGE sql SECURITY DEFINER STABLE SET search_path TO pg_catalog, pg_temp AS $function$ @@ -131,7 +132,7 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_exhaustion(int); CREATE FUNCTION monitor.sequence_exhaustion(p_percent integer DEFAULT 75, OUT count bigint) - LANGUAGE sql SECURITY DEFINER + LANGUAGE sql SECURITY DEFINER STABLE SET search_path TO pg_catalog, pg_temp AS $function$ diff --git a/postgres_exporter/common/pg12/queries_general.yml b/postgres_exporter/common/pg12/queries_general.yml index d133567d..209fff63 100644 
--- a/postgres_exporter/common/pg12/queries_general.yml +++ b/postgres_exporter/common/pg12/queries_general.yml @@ -53,11 +53,16 @@ ccp_replication_lag: WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER END - AS replay_time" + AS replay_time + , EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + AS received_time" metrics: - replay_time: usage: "GAUGE" - description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary." + description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL received equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." + - received_time: + usage: "GAUGE" + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, possibly causing false positives if the primary stops writing. Monitors for replicas that stop receiving WAL altogether." 
ccp_replication_lag_size: diff --git a/postgres_exporter/common/pg12/setup.sql b/postgres_exporter/common/pg12/setup.sql index 0c8aa531..7e040eb0 100644 --- a/postgres_exporter/common/pg12/setup.sql +++ b/postgres_exporter/common/pg12/setup.sql @@ -74,7 +74,7 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_status(); CREATE FUNCTION monitor.sequence_status() RETURNS TABLE (sequence_name text, last_value bigint, slots numeric, used numeric, percent int, cycle boolean, numleft numeric, table_usage text) - LANGUAGE sql SECURITY DEFINER + LANGUAGE sql SECURITY DEFINER STABLE SET search_path TO pg_catalog, pg_temp AS $function$ @@ -132,7 +132,7 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_exhaustion(int); CREATE FUNCTION monitor.sequence_exhaustion(p_percent integer DEFAULT 75, OUT count bigint) - LANGUAGE sql SECURITY DEFINER + LANGUAGE sql SECURITY DEFINER STABLE SET search_path TO pg_catalog, pg_temp AS $function$ @@ -426,6 +426,7 @@ CREATE FUNCTION monitor.pg_stat_statements_reset_info(p_throttle_minutes integer RETURNS bigint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE diff --git a/postgres_exporter/common/pg13/queries_general.yml b/postgres_exporter/common/pg13/queries_general.yml index 8c2c83ef..39b97d69 100644 --- a/postgres_exporter/common/pg13/queries_general.yml +++ b/postgres_exporter/common/pg13/queries_general.yml @@ -53,11 +53,16 @@ ccp_replication_lag: WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER END - AS replay_time" + AS replay_time + , EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + AS received_time" metrics: - replay_time: usage: "GAUGE" - description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary." + description: "Length of time since the last transaction was replayed on replica. 
Returns zero if last WAL received equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." + - received_time: + usage: "GAUGE" + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, possibly causing false positives if the primary stops writing. Monitors for replicas that stop receiving WAL altogether." ccp_replication_lag_size: diff --git a/postgres_exporter/common/pg13/setup.sql b/postgres_exporter/common/pg13/setup.sql index 1295e549..1b21e05f 100644 --- a/postgres_exporter/common/pg13/setup.sql +++ b/postgres_exporter/common/pg13/setup.sql @@ -426,6 +426,7 @@ CREATE FUNCTION monitor.pg_stat_statements_reset_info(p_throttle_minutes integer RETURNS bigint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE diff --git a/postgres_exporter/common/pg96/queries_general.yml b/postgres_exporter/common/pg96/queries_general.yml index bb782178..64101269 100644 --- a/postgres_exporter/common/pg96/queries_general.yml +++ b/postgres_exporter/common/pg96/queries_general.yml @@ -55,11 +55,16 @@ ccp_replication_lag: WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER END - AS replay_time" + AS replay_time + , EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + AS received_time" metrics: - replay_time: usage: "GAUGE" - description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary." + description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL received equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." 
+ - received_time: + usage: "GAUGE" + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, possibly causing false positives if the primary stops writing. Monitors for replicas that stop receiving WAL altogether." ccp_replication_lag_size: diff --git a/postgres_exporter/common/queries_nodemx.yml b/postgres_exporter/common/queries_nodemx.yml index 93d5cffc..fd3d6ca4 100644 --- a/postgres_exporter/common/queries_nodemx.yml +++ b/postgres_exporter/common/queries_nodemx.yml @@ -52,9 +52,6 @@ ccp_nodemx_mem: - mapped_file: usage: "GAUGE" description: "Total bytes of mapped file (includes tmpfs/shmem)" - - mapped_file: - usage: "GAUGE" - description: "Total bytes of mapped file (includes tmpfs/shmem)" - dirty: usage: "GAUGE" description: "Total bytes that are waiting to get written back to the disk" diff --git a/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example b/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example index bba97f7b..fc6c41ac 100644 --- a/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example +++ b/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example @@ -92,7 +92,7 @@ groups: # If metric returns 1, then pg_settings have changed since last known valid state # To see what may have changed, check the monitor.pg_settings_checksum table for a history of config state. # - alert: PGSettingsChecksum -# expr: ccp_pg_settings_checksum > 0 +# expr: ccp_pg_settings_checksum_status > 0 # for 60s # labels: # service: postgresql @@ -105,7 +105,7 @@ groups: # Monitor for data block checksum failures. 
Only works in PG12+ # - alert: PGDataChecksum -# expr: ccp_data_checksum_failure > 0 +# expr: ccp_data_checksum_failure_count > 0 # for 60s # labels: # service: postgresql @@ -182,7 +182,7 @@ groups: summary: 'PGSQL Instance connections' - alert: PGDBSize - expr: ccp_database_size > 1.073741824e+11 + expr: ccp_database_size_bytes > 1.073741824e+11 for: 60s labels: service: postgresql @@ -193,7 +193,7 @@ groups: summary: 'PGSQL Instance size warning' - alert: PGDBSize - expr: ccp_database_size > 2.68435456e+11 + expr: ccp_database_size_bytes > 2.68435456e+11 for: 60s labels: service: postgresql @@ -204,7 +204,7 @@ groups: summary: 'PGSQL Instance size critical' - alert: PGReplicationByteLag - expr: ccp_replication_status_byte_lag > 5.24288e+07 + expr: ccp_replication_lag_size_bytes > 5.24288e+07 for: 60s labels: service: postgresql @@ -215,7 +215,7 @@ groups: summary: 'PGSQL Instance replica lag warning' - alert: PGReplicationByteLag - expr: ccp_replication_status_byte_lag > 1.048576e+08 + expr: ccp_replication_lag_size_bytes > 1.048576e+08 for: 60s labels: service: postgresql @@ -321,9 +321,9 @@ groups: # Otherwise rule will be applied to all stanzas returned on target system if not set. # # Relevant metric names are: -# ccp_backrest_last_full_time_since_completion_seconds -# ccp_backrest_last_incr_time_since_completion_seconds -# ccp_backrest_last_diff_time_since_completion_seconds +# ccp_backrest_last_full_backup_time_since_completion_seconds +# ccp_backrest_last_incr_backup_time_since_completion_seconds +# ccp_backrest_last_diff_backup_time_since_completion_seconds # # To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day. # Further adjustment may be needed depending on your backup runtimes/schedule. 
@@ -351,14 +351,14 @@ groups: # # Runtime monitoring is handled with a single metric: # -# ccp_backrest_last_runtime_backup_runtime_seconds +# ccp_backrest_last_info_backup_runtime_seconds # # Runtime monitoring should have the "backup_type" label set. # Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr) # Stanza should also be set if runtimes per stanza have different expected times # # - alert: PGBackRestLastRuntimeFull_main -# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400 +# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400 # for: 60s # labels: # service: postgresql @@ -368,7 +368,7 @@ groups: # summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours' # # - alert: PGBackRestLastRuntimeDiff_main -# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600 +# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600 # for: 60s # labels: # service: postgresql diff --git a/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example b/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example index 23126b71..86900569 100644 --- a/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example +++ b/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example @@ -20,6 +20,19 @@ groups: summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. 
Error count: ( {{ $value }} )' +########## SYSTEM RULES ########## + - alert: ExporterDown + expr: avg_over_time(up[5m]) < 0.5 + for: 10s + labels: + service: system + severity: critical + severity_num: 300 + annotations: + description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.' + summary: 'Prometheus Exporter Service Down' + + ########## POSTGRESQL RULES ########## - alert: PGIsUp expr: pg_up < 1 @@ -193,7 +206,7 @@ groups: summary: 'PGSQL Instance size critical' - alert: PGReplicationByteLag - expr: ccp_replication_status_byte_lag > 5.24288e+07 + expr: ccp_replication_lag_size_bytes > 5.24288e+07 for: 60s labels: service: postgresql @@ -204,7 +217,7 @@ groups: summary: 'PGSQL Instance replica lag warning' - alert: PGReplicationByteLag - expr: ccp_replication_status_byte_lag > 1.048576e+08 + expr: ccp_replication_lag_size_bytes > 1.048576e+08 for: 60s labels: service: postgresql @@ -310,12 +323,15 @@ groups: # Otherwise rule will be applied to all stanzas returned on target system if not set. # # Relevant metric names are: -# ccp_backrest_last_full_time_since_completion_seconds -# ccp_backrest_last_incr_time_since_completion_seconds -# ccp_backrest_last_diff_time_since_completion_seconds +# ccp_backrest_last_full_backup_time_since_completion_seconds +# ccp_backrest_last_incr_backup_time_since_completion_seconds +# ccp_backrest_last_diff_backup_time_since_completion_seconds +# +# To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day. +# Further adjustment may be needed depending on your backup runtimes/schedule. 
# # - alert: PGBackRestLastCompletedFull_main -# expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 604800 +# expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000 # for: 60s # labels: # service: postgresql @@ -325,7 +341,7 @@ groups: # summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.' # # - alert: PGBackRestLastCompletedIncr_main -# expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 86400 +# expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600 # for: 60s # labels: # service: postgresql @@ -337,14 +353,14 @@ groups: # # Runtime monitoring is handled with a single metric: # -# ccp_backrest_last_runtime_backup_runtime_seconds +# ccp_backrest_last_info_backup_runtime_seconds # # Runtime monitoring should have the "backup_type" label set. # Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr) # Stanza should also be set if runtimes per stanza have different expected times # # - alert: PGBackRestLastRuntimeFull_main -# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400 +# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400 # for: 60s # labels: # service: postgresql @@ -354,7 +370,7 @@ groups: # summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours' # # - alert: PGBackRestLastRuntimeDiff_main -# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600 +# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600 # for: 60s # labels: # service: postgresql diff --git a/prometheus/linux/alert-rules.d/crunchy-alert-rules-node.yml.example b/prometheus/linux/alert-rules.d/crunchy-alert-rules-node.yml.example index 101ee192..1723055c 100644 --- 
a/prometheus/linux/alert-rules.d/crunchy-alert-rules-node.yml.example +++ b/prometheus/linux/alert-rules.d/crunchy-alert-rules-node.yml.example @@ -103,7 +103,7 @@ groups: description: 'Swap usage for target {{ $labels.job }} is at {{ $value }}%' - alert: SwapUsage - expr: (100 - (100 * (node_memory_SwapFree_byte / node_memory_SwapTotal_bytes))) > 80 + expr: (100 - (100 * (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes))) > 80 for: 1m labels: service: system