From ffee967a7e06aff0b4d4171d0a5f407404ad2edb Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Tue, 4 May 2021 17:11:51 -0400 Subject: [PATCH 01/11] Fix search path in monitoring functions --- postgres_exporter/common/pg10/setup.sql | 7 ++++++- postgres_exporter/common/pg11/setup.sql | 10 ++++++++-- postgres_exporter/common/pg12/setup.sql | 11 +++++++++-- postgres_exporter/common/pg13/setup.sql | 11 +++++++++-- postgres_exporter/common/pg96/setup.sql | 7 +++++++ 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/postgres_exporter/common/pg10/setup.sql b/postgres_exporter/common/pg10/setup.sql index 1b1d2df7..a52d0124 100644 --- a/postgres_exporter/common/pg10/setup.sql +++ b/postgres_exporter/common/pg10/setup.sql @@ -32,6 +32,7 @@ DROP FUNCTION IF EXISTS monitor.pgbackrest_info(); -- old version from 2.3 DROP FUNCTION IF EXISTS monitor.pgbackrest_info(int); CREATE OR REPLACE FUNCTION monitor.pgbackrest_info(p_throttle_minutes int DEFAULT 10) RETURNS SETOF monitor.pgbackrest_info LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -73,6 +74,7 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_status(); CREATE FUNCTION monitor.sequence_status() RETURNS TABLE (sequence_name text, last_value bigint, slots numeric, used numeric, percent int, cycle boolean, numleft numeric, table_usage text) LANGUAGE sql SECURITY DEFINER STABLE + SET search_path TO pg_catalog, pg_temp AS $function$ /* @@ -129,7 +131,8 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_exhaustion(int); CREATE FUNCTION monitor.sequence_exhaustion(p_percent integer DEFAULT 75, OUT count bigint) - LANGUAGE sql SECURITY DEFINER STABLE + LANGUAGE sql SECURITY DEFINER STABLE + SET search_path TO pg_catalog, pg_temp AS $function$ /* @@ -187,6 +190,7 @@ DROP FUNCTION IF EXISTS monitor.pg_settings_checksum(text); CREATE FUNCTION monitor.pg_settings_checksum(p_known_settings_hash text DEFAULT NULL) RETURNS smallint LANGUAGE plpgsql SECURITY 
DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -266,6 +270,7 @@ DROP FUNCTION IF EXISTS monitor.pg_hba_checksum(text); CREATE FUNCTION monitor.pg_hba_checksum(p_known_hba_hash text DEFAULT NULL) RETURNS smallint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE diff --git a/postgres_exporter/common/pg11/setup.sql b/postgres_exporter/common/pg11/setup.sql index 1ce8e4f2..3da50ee2 100644 --- a/postgres_exporter/common/pg11/setup.sql +++ b/postgres_exporter/common/pg11/setup.sql @@ -18,6 +18,7 @@ END $$; GRANT pg_monitor to ccp_monitoring; +GRANT pg_execute_server_program TO ccp_monitoring; ALTER ROLE ccp_monitoring SET lock_timeout TO '2min'; @@ -31,7 +32,8 @@ ALTER TABLE monitor.pgbackrest_info SET (autovacuum_analyze_scale_factor = 0, au DROP FUNCTION IF EXISTS monitor.pgbackrest_info(); -- old version from 2.3 DROP FUNCTION IF EXISTS monitor.pgbackrest_info(int); CREATE OR REPLACE FUNCTION monitor.pgbackrest_info(p_throttle_minutes int DEFAULT 10) RETURNS SETOF monitor.pgbackrest_info - LANGUAGE plpgsql SECURITY DEFINER + LANGUAGE plpgsql + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -73,6 +75,7 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_status(); CREATE FUNCTION monitor.sequence_status() RETURNS TABLE (sequence_name text, last_value bigint, slots numeric, used numeric, percent int, cycle boolean, numleft numeric, table_usage text) LANGUAGE sql SECURITY DEFINER STABLE + SET search_path TO pg_catalog, pg_temp AS $function$ /* @@ -129,7 +132,8 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_exhaustion(int); CREATE FUNCTION monitor.sequence_exhaustion(p_percent integer DEFAULT 75, OUT count bigint) - LANGUAGE sql SECURITY DEFINER STABLE + LANGUAGE sql SECURITY DEFINER STABLE + SET search_path TO pg_catalog, pg_temp AS $function$ /* @@ -187,6 +191,7 @@ DROP FUNCTION IF EXISTS monitor.pg_settings_checksum(text); CREATE FUNCTION 
monitor.pg_settings_checksum(p_known_settings_hash text DEFAULT NULL) RETURNS smallint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -266,6 +271,7 @@ DROP FUNCTION IF EXISTS monitor.pg_hba_checksum(text); CREATE FUNCTION monitor.pg_hba_checksum(p_known_hba_hash text DEFAULT NULL) RETURNS smallint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE diff --git a/postgres_exporter/common/pg12/setup.sql b/postgres_exporter/common/pg12/setup.sql index d73f9257..7e040eb0 100644 --- a/postgres_exporter/common/pg12/setup.sql +++ b/postgres_exporter/common/pg12/setup.sql @@ -18,6 +18,7 @@ END $$; GRANT pg_monitor to ccp_monitoring; +GRANT pg_execute_server_program TO ccp_monitoring; ALTER ROLE ccp_monitoring SET lock_timeout TO '2min'; @@ -31,7 +32,8 @@ ALTER TABLE monitor.pgbackrest_info SET (autovacuum_analyze_scale_factor = 0, au DROP FUNCTION IF EXISTS monitor.pgbackrest_info(); -- old version from 2.3 DROP FUNCTION IF EXISTS monitor.pgbackrest_info(int); CREATE OR REPLACE FUNCTION monitor.pgbackrest_info(p_throttle_minutes int DEFAULT 10) RETURNS SETOF monitor.pgbackrest_info - LANGUAGE plpgsql SECURITY DEFINER + LANGUAGE plpgsql + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -73,6 +75,7 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_status(); CREATE FUNCTION monitor.sequence_status() RETURNS TABLE (sequence_name text, last_value bigint, slots numeric, used numeric, percent int, cycle boolean, numleft numeric, table_usage text) LANGUAGE sql SECURITY DEFINER STABLE + SET search_path TO pg_catalog, pg_temp AS $function$ /* @@ -129,7 +132,8 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_exhaustion(int); CREATE FUNCTION monitor.sequence_exhaustion(p_percent integer DEFAULT 75, OUT count bigint) - LANGUAGE sql SECURITY DEFINER STABLE + LANGUAGE sql SECURITY DEFINER STABLE + SET search_path TO pg_catalog, pg_temp AS $function$ /* @@ -187,6 
+191,7 @@ DROP FUNCTION IF EXISTS monitor.pg_settings_checksum(text); CREATE FUNCTION monitor.pg_settings_checksum(p_known_settings_hash text DEFAULT NULL) RETURNS smallint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -266,6 +271,7 @@ DROP FUNCTION IF EXISTS monitor.pg_hba_checksum(text); CREATE FUNCTION monitor.pg_hba_checksum(p_known_hba_hash text DEFAULT NULL) RETURNS smallint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -420,6 +426,7 @@ CREATE FUNCTION monitor.pg_stat_statements_reset_info(p_throttle_minutes integer RETURNS bigint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE diff --git a/postgres_exporter/common/pg13/setup.sql b/postgres_exporter/common/pg13/setup.sql index f1e98dec..1b21e05f 100644 --- a/postgres_exporter/common/pg13/setup.sql +++ b/postgres_exporter/common/pg13/setup.sql @@ -18,6 +18,7 @@ END $$; GRANT pg_monitor to ccp_monitoring; +GRANT pg_execute_server_program TO ccp_monitoring; ALTER ROLE ccp_monitoring SET lock_timeout TO '2min'; @@ -31,7 +32,8 @@ ALTER TABLE monitor.pgbackrest_info SET (autovacuum_analyze_scale_factor = 0, au DROP FUNCTION IF EXISTS monitor.pgbackrest_info(); -- old version from 2.3 DROP FUNCTION IF EXISTS monitor.pgbackrest_info(int); CREATE OR REPLACE FUNCTION monitor.pgbackrest_info(p_throttle_minutes int DEFAULT 10) RETURNS SETOF monitor.pgbackrest_info - LANGUAGE plpgsql SECURITY DEFINER + LANGUAGE plpgsql + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -73,6 +75,7 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_status(); CREATE FUNCTION monitor.sequence_status() RETURNS TABLE (sequence_name text, last_value bigint, slots numeric, used numeric, percent int, cycle boolean, numleft numeric, table_usage text) LANGUAGE sql SECURITY DEFINER STABLE + SET search_path TO pg_catalog, pg_temp AS $function$ /* @@ -129,7 +132,8 @@ 
$function$; DROP FUNCTION IF EXISTS monitor.sequence_exhaustion(int); CREATE FUNCTION monitor.sequence_exhaustion(p_percent integer DEFAULT 75, OUT count bigint) - LANGUAGE sql SECURITY DEFINER STABLE + LANGUAGE sql SECURITY DEFINER STABLE + SET search_path TO pg_catalog, pg_temp AS $function$ /* @@ -187,6 +191,7 @@ DROP FUNCTION IF EXISTS monitor.pg_settings_checksum(text); CREATE FUNCTION monitor.pg_settings_checksum(p_known_settings_hash text DEFAULT NULL) RETURNS smallint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -266,6 +271,7 @@ DROP FUNCTION IF EXISTS monitor.pg_hba_checksum(text); CREATE FUNCTION monitor.pg_hba_checksum(p_known_hba_hash text DEFAULT NULL) RETURNS smallint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -420,6 +426,7 @@ CREATE FUNCTION monitor.pg_stat_statements_reset_info(p_throttle_minutes integer RETURNS bigint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE diff --git a/postgres_exporter/common/pg96/setup.sql b/postgres_exporter/common/pg96/setup.sql index b99a0ed1..41ee0668 100644 --- a/postgres_exporter/common/pg96/setup.sql +++ b/postgres_exporter/common/pg96/setup.sql @@ -23,6 +23,7 @@ CREATE SCHEMA IF NOT EXISTS monitor AUTHORIZATION ccp_monitoring; DROP FUNCTION IF EXISTS monitor.pg_stat_activity(); CREATE OR REPLACE FUNCTION monitor.pg_stat_activity() RETURNS SETOF pg_catalog.pg_stat_activity LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $$ BEGIN RETURN query(SELECT * FROM pg_catalog.pg_stat_activity); @@ -35,6 +36,7 @@ REVOKE ALL ON FUNCTION monitor.pg_stat_activity() FROM PUBLIC; DROP FUNCTION IF EXISTS monitor.streaming_replica_check(); CREATE OR REPLACE FUNCTION monitor.streaming_replica_check() RETURNS TABLE (replica_hostname text, replica_addr inet, replica_port int, byte_lag numeric) LANGUAGE SQL SECURITY DEFINER + SET search_path TO 
pg_catalog, pg_temp AS $$ SELECT client_hostname as replica_hostname , client_addr as replica_addr @@ -51,6 +53,7 @@ DROP FUNCTION IF EXISTS monitor.pg_ls_wal_dir(text); CREATE OR REPLACE FUNCTION monitor.pg_ls_waldir() RETURNS SETOF TEXT LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp as $$ BEGIN IF current_setting('server_version_num')::int >= 100000 THEN @@ -72,6 +75,7 @@ DROP FUNCTION IF EXISTS monitor.pgbackrest_info(); -- old version from 2.3 DROP FUNCTION IF EXISTS monitor.pgbackrest_info(int); CREATE OR REPLACE FUNCTION monitor.pgbackrest_info(p_throttle_minutes int DEFAULT 10) RETURNS SETOF monitor.pgbackrest_info LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -113,6 +117,7 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_status(); CREATE FUNCTION monitor.sequence_status() RETURNS TABLE (sequence_name text, last_value bigint, slots numeric, used numeric, percent bigint, cycle boolean, numleft numeric, table_usage text) LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -222,6 +227,7 @@ $function$; DROP FUNCTION IF EXISTS monitor.sequence_exhaustion(int); CREATE FUNCTION monitor.sequence_exhaustion(p_percent int DEFAULT 75, out count bigint) LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE @@ -300,6 +306,7 @@ DROP FUNCTION IF EXISTS monitor.pg_settings_checksum(text); CREATE FUNCTION monitor.pg_settings_checksum(p_known_settings_hash text DEFAULT NULL) RETURNS smallint LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp AS $function$ DECLARE From 865b57efd6b1b1327b161c878f7b454ffe4c8bb7 Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Fri, 7 May 2021 10:59:10 -0400 Subject: [PATCH 02/11] Add new metric ccp_replication_lag_received_time --- hugo/content/exporter/_index.md | 4 +++- postgres_exporter/common/pg10/queries_general.yml | 9 +++++++-- 
postgres_exporter/common/pg11/queries_general.yml | 9 +++++++-- postgres_exporter/common/pg12/queries_general.yml | 9 +++++++-- postgres_exporter/common/pg13/queries_general.yml | 9 +++++++-- postgres_exporter/common/pg96/queries_general.yml | 9 +++++++-- 6 files changed, 38 insertions(+), 11 deletions(-) diff --git a/hugo/content/exporter/_index.md b/hugo/content/exporter/_index.md index 2d7351c0..523cc889 100644 --- a/hugo/content/exporter/_index.md +++ b/hugo/content/exporter/_index.md @@ -463,7 +463,9 @@ The following metrics either require special considerations when monitoring spec * *ccp_connection_stats_max_blocked_query_time* - Runtime of the longest running query that has been blocked by a heavyweight lock - * *ccp_replication_lag_replay_time* - Only provides values on replica instances. Time since replica received and replayed a WAL file. Note this is not the main way to determine if a replica is behind its primary. It only monitors the time the replica replayed the WAL vs what it has received. It is a secondary metric for monitoring WAL replay on the replica itself. + * *ccp_replication_lag_replay_time* - Only provides values on replica instances. Time since replica received and replayed a WAL file. Note this is not the main way to determine if a replica is behind its primary. It only monitors the time since the replica replayed the WAL vs what it has received. It also does not monitor when a WAL replay replica completely stops receiving WAL (see received_time metric). It is a secondary metric for monitoring WAL replay on the replica itself. + + * *ccp_replication_lag_received_time* - Similar to *ccp_replication_lag_replay_time*, however this value always increases until the replica actually receives and replays a WAL file. Effective for monitoring that a WAL replay replica has actually received WAL files. Note this will cause false positives when used as an alert for replica lag if the primary receives no writes. 
* *ccp_replication_lag_size_bytes* - Only provides values on instances that have attached replicas (primary, cascading replica). Tracks byte lag of every streaming replica connected to this database instance. This is the main way that replication lag is monitored. Note that if you have WAL replay only replicas, this will not be reflected here. diff --git a/postgres_exporter/common/pg10/queries_general.yml b/postgres_exporter/common/pg10/queries_general.yml index 27dc9164..c4a09049 100644 --- a/postgres_exporter/common/pg10/queries_general.yml +++ b/postgres_exporter/common/pg10/queries_general.yml @@ -53,11 +53,16 @@ ccp_replication_lag: WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER END - AS replay_time" + AS replay_time + , EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + AS received_time" metrics: - replay_time: usage: "GAUGE" - description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary." + description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL recieved equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." + - received_time: + usage: "GAUGE" + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, even if primary stops writing. Monitors for replicas that stop receiving WAL all together." 
ccp_replication_lag_size: diff --git a/postgres_exporter/common/pg11/queries_general.yml b/postgres_exporter/common/pg11/queries_general.yml index 8505dfbf..1d1f3957 100644 --- a/postgres_exporter/common/pg11/queries_general.yml +++ b/postgres_exporter/common/pg11/queries_general.yml @@ -53,11 +53,16 @@ ccp_replication_lag: WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER END - AS replay_time" + AS replay_time + , EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + AS received_time" metrics: - replay_time: usage: "GAUGE" - description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary." + description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL received equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." + - received_time: + usage: "GAUGE" + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, even if primary stops writing. Monitors for replicas that stop receiving WAL altogether." ccp_replication_lag_size: diff --git a/postgres_exporter/common/pg12/queries_general.yml b/postgres_exporter/common/pg12/queries_general.yml index d133567d..12795c32 100644 --- a/postgres_exporter/common/pg12/queries_general.yml +++ b/postgres_exporter/common/pg12/queries_general.yml @@ -53,11 +53,16 @@ ccp_replication_lag: WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER END - AS replay_time" + AS replay_time + , EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + AS received_time" metrics: - replay_time: usage: "GAUGE" - description: "Length of time since the last transaction was replayed on replica. 
Will always increase if no writes on primary." + description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL received equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." + - received_time: + usage: "GAUGE" + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, even if primary stops writing. Monitors for replicas that stop receiving WAL altogether." ccp_replication_lag_size: diff --git a/postgres_exporter/common/pg13/queries_general.yml b/postgres_exporter/common/pg13/queries_general.yml index 8c2c83ef..66e31d2f 100644 --- a/postgres_exporter/common/pg13/queries_general.yml +++ b/postgres_exporter/common/pg13/queries_general.yml @@ -53,11 +53,16 @@ ccp_replication_lag: WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER END - AS replay_time" + AS replay_time + , EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + AS received_time" metrics: - replay_time: usage: "GAUGE" - description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary." + description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL received equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." + - received_time: + usage: "GAUGE" + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, even if primary stops writing. Monitors for replicas that stop receiving WAL altogether." 
ccp_replication_lag_size: diff --git a/postgres_exporter/common/pg96/queries_general.yml b/postgres_exporter/common/pg96/queries_general.yml index bb782178..5968dbbd 100644 --- a/postgres_exporter/common/pg96/queries_general.yml +++ b/postgres_exporter/common/pg96/queries_general.yml @@ -55,11 +55,16 @@ ccp_replication_lag: WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER END - AS replay_time" + AS replay_time + , EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + AS received_time" metrics: - replay_time: usage: "GAUGE" - description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary." + description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL received equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." + - received_time: + usage: "GAUGE" + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, even if primary stops writing. Monitors for replicas that stop receiving WAL altogether." 
ccp_replication_lag_size: From fb4e99cb7611b35bd5c81e176ccd399009a91c6d Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Mon, 10 May 2021 09:13:57 -0400 Subject: [PATCH 03/11] Update hugo/content/exporter/_index.md Co-authored-by: Douglas J Hunley --- hugo/content/exporter/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hugo/content/exporter/_index.md b/hugo/content/exporter/_index.md index 523cc889..3ae3a1e5 100644 --- a/hugo/content/exporter/_index.md +++ b/hugo/content/exporter/_index.md @@ -463,7 +463,7 @@ The following metrics either require special considerations when monitoring spec * *ccp_connection_stats_max_blocked_query_time* - Runtime of the longest running query that has been blocked by a heavyweight lock - * *ccp_replication_lag_replay_time* - Only provides values on replica instances. Time since replica received and replayed a WAL file. Note this is not the main way to determine if a replica is behind its primary. It only monitors the time since the replica replayed the WAL vs what it has received. It also does not monitor when a WAL replay replica completely stops receiving WAL (see received_time metric). It is a secondary metric for monitoring WAL replay on the replica itself. + * *ccp_replication_lag_replay_time* - Time since replica received and replayed a WAL file; only shown on replica instances. Note that this is not the main way to determine if a replica is behind its primary. This metric only monitors the time since the replica replayed the WAL vs when it was received. It also does not monitor when a WAL replay replica completely stops receiving WAL (see received_time metric). It is a secondary metric for monitoring WAL replay on the replica itself. * *ccp_replication_lag_received_time* - Similar to *ccp_replication_lag_replay_time*, however this value always increases until the replica actually receives and replays a WAL file. Effective for monitoring that a WAL replay replica has actually received WAL files. 
Note this will cause false positives when used as an alert for replica lag if the primary receives no writes. From 9d9b1bfb5c8e6a413246b75141a46d78d0f47123 Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Thu, 13 May 2021 09:40:05 -0400 Subject: [PATCH 04/11] Fix all dashboards to reload variables on load --- grafana/common/Bloat_Details.json | 2 +- grafana/common/QueryStatistics.json | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/grafana/common/Bloat_Details.json b/grafana/common/Bloat_Details.json index a9419dc6..0b6efcfb 100644 --- a/grafana/common/Bloat_Details.json +++ b/grafana/common/Bloat_Details.json @@ -362,7 +362,7 @@ "query": "label_values(up{exp_type='pg'}, cluster_name)", "refId": "StandardVariableQuery" }, - "refresh": 0, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, diff --git a/grafana/common/QueryStatistics.json b/grafana/common/QueryStatistics.json index 908d7b32..fd46cb7c 100644 --- a/grafana/common/QueryStatistics.json +++ b/grafana/common/QueryStatistics.json @@ -736,7 +736,7 @@ "query": "label_values(up{exp_type='pg'}, cluster_name)", "refId": "StandardVariableQuery" }, - "refresh": 0, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -803,7 +803,7 @@ "query": "label_values(ccp_pg_stat_statements_total_calls_count{cluster_name=\"[[pgcluster]]\", job=\"[[pgnodes]]\"}, dbname) ", "refId": "StandardVariableQuery" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, @@ -838,7 +838,7 @@ "query": "label_values(ccp_pg_stat_statements_total_calls_count{cluster_name=\"[[pgcluster]]\", job=\"[[pgnodes]]\",dbname=~\"[[pgdatabase]]\"}, role) ", "refId": "StandardVariableQuery" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, From d080115933d49304152a107228a7a5a1b17b52ec Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Fri, 21 May 2021 12:23:39 -0400 Subject: [PATCH 05/11] Apply suggestions from code review Co-authored-by: Douglas J Hunley 
--- hugo/content/exporter/_index.md | 2 +- postgres_exporter/common/pg10/queries_general.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hugo/content/exporter/_index.md b/hugo/content/exporter/_index.md index 3ae3a1e5..df9a9e13 100644 --- a/hugo/content/exporter/_index.md +++ b/hugo/content/exporter/_index.md @@ -465,7 +465,7 @@ The following metrics either require special considerations when monitoring spec * *ccp_replication_lag_replay_time* - Time since replica received and replayed a WAL file; only shown on replica instances. Note that this is not the main way to determine if a replica is behind its primary. This metric only monitors the time since the replica replayed the WAL vs when it was received. It also does not monitor when a WAL replay replica completely stops receiving WAL (see received_time metric). It is a secondary metric for monitoring WAL replay on the replica itself. - * *ccp_replication_lag_received_time* - Similar to *ccp_replication_lag_replay_time*, however this value always increases until the replica actually receives and replays a WAL file. Effective for monitoring that a WAL replay replica has actually received WAL files. Note this will cause false positives when used as an alert for replica lag if the primary receives no writes. + * *ccp_replication_lag_received_time* - Similar to *ccp_replication_lag_replay_time*, however this value always increases between receipt of WAL files. Effective for monitoring that a WAL replay replica has actually received WAL files. Note this will cause false positives when used as an alert for replica lag if the primary receives no writes (which means there is no WAL to send). * *ccp_replication_lag_size_bytes* - Only provides values on instances that have attached replicas (primary, cascading replica). Tracks byte lag of every streaming replica connected to this database instance. This is the main way that replication lag is monitored. 
Note that if you have WAL replay only replicas, this will not be reflected here. diff --git a/postgres_exporter/common/pg10/queries_general.yml b/postgres_exporter/common/pg10/queries_general.yml index c4a09049..51cbdaaa 100644 --- a/postgres_exporter/common/pg10/queries_general.yml +++ b/postgres_exporter/common/pg10/queries_general.yml @@ -62,7 +62,7 @@ ccp_replication_lag: description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL recieved equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." - received_time: usage: "GAUGE" - description: "Length of time since the last WAL file was received and replayed on replica. Always increases, even if primary stops writing. Monitors for replicas that stop receiving WAL all together." + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, causing false positives if the primary stops writing. Monitors for replicas that stop receiving WAL altogether." ccp_replication_lag_size: From 29c75fa4e995ec74296efac4ed86b92f3916cf8c Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Wed, 26 May 2021 13:50:20 -0400 Subject: [PATCH 06/11] Apply suggestions from code review Co-authored-by: Douglas J Hunley --- hugo/content/exporter/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hugo/content/exporter/_index.md b/hugo/content/exporter/_index.md index 30a9f324..eaef8c7a 100644 --- a/hugo/content/exporter/_index.md +++ b/hugo/content/exporter/_index.md @@ -463,7 +463,7 @@ The following metrics either require special considerations when monitoring spec * *ccp_connection_stats_max_blocked_query_time* - Runtime of the longest running query that has been blocked by a heavyweight lock - * *ccp_replication_lag_replay_time* - Time since replica received and replayed a WAL file; only shown on replica instances. 
Note that this is not the main way to determine if a replica is behind its primary. This metric only monitors the time since the replica replayed the WAL vs when it was received. It also does not monitor when a WAL replay replica completely stops receiving WAL (see received_time metric). It is a secondary metric for monitoring WAL replay on the replica itself. + * *ccp_replication_lag_replay_time* - Time since a replica received and replayed a WAL file; only shown on replica instances. Note that this is not the main way to determine if a replica is behind its primary. This metric only monitors the time since the replica replayed the WAL vs when it was received. It also does not monitor when a WAL replay replica completely stops receiving WAL (see received_time metric). It is a secondary metric for monitoring WAL replay on the replica itself. * *ccp_replication_lag_received_time* - Similar to *ccp_replication_lag_replay_time*, however this value always increases between replay of WAL files. Effective for monitoring that a WAL replay replica has actually received WAL files. Note this will cause false positives when used as an alert for replica lag if the primary receives no writes (which means there is no WAL to send). 
From 6c80ef511e238feb1a3aa91f2d08f91fc50da492 Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Tue, 1 Jun 2021 09:56:22 -0400 Subject: [PATCH 07/11] Fix incorrect metric names in alert rules --- .../crunchy-alert-rules-pg.yml.example | 24 +++++++++---------- ...nchy-alert-rules-pg.yml.containers.example | 23 ++++++++++-------- .../crunchy-alert-rules-node.yml.example | 2 +- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example b/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example index bba97f7b..fc6c41ac 100644 --- a/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example +++ b/prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example @@ -92,7 +92,7 @@ groups: # If metric returns 1, then pg_settings have changed since last known valid state # To see what may have changed, check the monitor.pg_settings_checksum table for a history of config state. # - alert: PGSettingsChecksum -# expr: ccp_pg_settings_checksum > 0 +# expr: ccp_pg_settings_checksum_status > 0 # for 60s # labels: # service: postgresql @@ -105,7 +105,7 @@ groups: # Monitor for data block checksum failures. 
Only works in PG12+ # - alert: PGDataChecksum -# expr: ccp_data_checksum_failure > 0 +# expr: ccp_data_checksum_failure_count > 0 # for 60s # labels: # service: postgresql @@ -182,7 +182,7 @@ groups: summary: 'PGSQL Instance connections' - alert: PGDBSize - expr: ccp_database_size > 1.073741824e+11 + expr: ccp_database_size_bytes > 1.073741824e+11 for: 60s labels: service: postgresql @@ -193,7 +193,7 @@ groups: summary: 'PGSQL Instance size warning' - alert: PGDBSize - expr: ccp_database_size > 2.68435456e+11 + expr: ccp_database_size_bytes > 2.68435456e+11 for: 60s labels: service: postgresql @@ -204,7 +204,7 @@ groups: summary: 'PGSQL Instance size critical' - alert: PGReplicationByteLag - expr: ccp_replication_status_byte_lag > 5.24288e+07 + expr: ccp_replication_lag_size_bytes > 5.24288e+07 for: 60s labels: service: postgresql @@ -215,7 +215,7 @@ groups: summary: 'PGSQL Instance replica lag warning' - alert: PGReplicationByteLag - expr: ccp_replication_status_byte_lag > 1.048576e+08 + expr: ccp_replication_lag_size_bytes > 1.048576e+08 for: 60s labels: service: postgresql @@ -321,9 +321,9 @@ groups: # Otherwise rule will be applied to all stanzas returned on target system if not set. # # Relevant metric names are: -# ccp_backrest_last_full_time_since_completion_seconds -# ccp_backrest_last_incr_time_since_completion_seconds -# ccp_backrest_last_diff_time_since_completion_seconds +# ccp_backrest_last_full_backup_time_since_completion_seconds +# ccp_backrest_last_incr_backup_time_since_completion_seconds +# ccp_backrest_last_diff_backup_time_since_completion_seconds # # To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day. # Further adjustment may be needed depending on your backup runtimes/schedule. 
@@ -351,14 +351,14 @@ groups: # # Runtime monitoring is handled with a single metric: # -# ccp_backrest_last_runtime_backup_runtime_seconds +# ccp_backrest_last_info_backup_runtime_seconds # # Runtime monitoring should have the "backup_type" label set. # Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr) # Stanza should also be set if runtimes per stanza have different expected times # # - alert: PGBackRestLastRuntimeFull_main -# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400 +# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400 # for: 60s # labels: # service: postgresql @@ -368,7 +368,7 @@ groups: # summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours' # # - alert: PGBackRestLastRuntimeDiff_main -# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600 +# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600 # for: 60s # labels: # service: postgresql diff --git a/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example b/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example index 23126b71..bad39e92 100644 --- a/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example +++ b/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example @@ -193,7 +193,7 @@ groups: summary: 'PGSQL Instance size critical' - alert: PGReplicationByteLag - expr: ccp_replication_status_byte_lag > 5.24288e+07 + expr: ccp_replication_lag_size_bytes > 5.24288e+07 for: 60s labels: service: postgresql @@ -204,7 +204,7 @@ groups: summary: 'PGSQL Instance replica lag warning' - alert: PGReplicationByteLag - expr: ccp_replication_status_byte_lag > 1.048576e+08 + expr: ccp_replication_lag_size_bytes > 1.048576e+08 for: 60s labels: service: postgresql @@ 
-310,12 +310,15 @@ groups: # Otherwise rule will be applied to all stanzas returned on target system if not set. # # Relevant metric names are: -# ccp_backrest_last_full_time_since_completion_seconds -# ccp_backrest_last_incr_time_since_completion_seconds -# ccp_backrest_last_diff_time_since_completion_seconds +# ccp_backrest_last_full_backup_time_since_completion_seconds +# ccp_backrest_last_incr_backup_time_since_completion_seconds +# ccp_backrest_last_diff_backup_time_since_completion_seconds +# +# To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day. +# Further adjustment may be needed depending on your backup runtimes/schedule. # # - alert: PGBackRestLastCompletedFull_main -# expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 604800 +# expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000 # for: 60s # labels: # service: postgresql @@ -325,7 +328,7 @@ groups: # summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.' # # - alert: PGBackRestLastCompletedIncr_main -# expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 86400 +# expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600 # for: 60s # labels: # service: postgresql @@ -337,14 +340,14 @@ groups: # # Runtime monitoring is handled with a single metric: # -# ccp_backrest_last_runtime_backup_runtime_seconds +# ccp_backrest_last_info_backup_runtime_seconds # # Runtime monitoring should have the "backup_type" label set. 
# Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr) # Stanza should also be set if runtimes per stanza have different expected times # # - alert: PGBackRestLastRuntimeFull_main -# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400 +# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400 # for: 60s # labels: # service: postgresql @@ -354,7 +357,7 @@ groups: # summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours' # # - alert: PGBackRestLastRuntimeDiff_main -# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600 +# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600 # for: 60s # labels: # service: postgresql diff --git a/prometheus/linux/alert-rules.d/crunchy-alert-rules-node.yml.example b/prometheus/linux/alert-rules.d/crunchy-alert-rules-node.yml.example index 101ee192..1723055c 100644 --- a/prometheus/linux/alert-rules.d/crunchy-alert-rules-node.yml.example +++ b/prometheus/linux/alert-rules.d/crunchy-alert-rules-node.yml.example @@ -103,7 +103,7 @@ groups: description: 'Swap usage for target {{ $labels.job }} is at {{ $value }}%' - alert: SwapUsage - expr: (100 - (100 * (node_memory_SwapFree_byte / node_memory_SwapTotal_bytes))) > 80 + expr: (100 - (100 * (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes))) > 80 for: 1m labels: service: system From becf428f5bce8604358213db2041c0c93882d164 Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Thu, 3 Jun 2021 10:40:23 -0400 Subject: [PATCH 08/11] Update changelog --- hugo/content/changelog/_index.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hugo/content/changelog/_index.md b/hugo/content/changelog/_index.md index e9d5a6e5..18d2b702 100644 --- a/hugo/content/changelog/_index.md +++ b/hugo/content/changelog/_index.md @@ -12,14 +12,20 @@ 
weight: 5 * The pgBackRest Grafana dashboard now presents data on a per-stanza basis * Removed deprecated node_exporter metrics from Grafana OS Details dashboard. Reorganized panels. * Added a basic Network Activity dashboard to Grafana using default metrics that come with node_exporter. - * The pgMonitor repository has been reorganized to better delineate which platforms specific files apply to. Some files have also been renamed as part of this reorganization. + * The pgMonitor repository has been reorganized to better around which platforms files apply to . Some files have also been renamed as part of this reorganization. * Extended the default alert threshold for pgBackRest backups to give a buffer time and avoid false positives when backup runtimes vary. * Added a default alert for PostgreSQL failover that should work in any scenario to produce an alert when the recovery status of a PostgreSQL database changes (replica -> primary or primary -> replica). Note that this alert will auto-resolve after 5 minutes (by default) since it is just looking for recent state changes. The alert is meant to be acted upon immediately to see what may have occured on the systems involved. * Added metric to monitor and alert on blocked queries ### Bug Fixes + * Fixed several incorrect metric names in alert expressions for the example alert files. Please review all alerts to ensure your expressions are checking the correct metrics, making special note of the following: + * PGSettingsChecksum + * PGDBSize + * PGReplicationByteLag + * MemoryAvailable + * SwapUsage + * All pgBackRest alerts * Fixed pgBackRest metrics not reporting all backups in all stanzas for a given repository in some configuration setups. Each database will now only report back monitoring for the stanzas that are part of its own instance. Previously all database instances reported back all stanzas in the target repository. 
- * Fixed incorrect metric name in warning alert for available memory in linux/node_exporter default alerts (node_memory_Available_bytes should be node_memory_MemAvailable_bytes) * Fixed incorrect title of panel on Grafana PostgreSQL Details dashboard from "Transactions Per Minute" to "Transactions Per Second". ### Manual Intervention Changes From 6f3f25e712fab71a912109ae2a467751360a23a5 Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Fri, 4 Jun 2021 10:44:08 -0400 Subject: [PATCH 09/11] Update hugo/content/changelog/_index.md Co-authored-by: Douglas J Hunley --- hugo/content/changelog/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hugo/content/changelog/_index.md b/hugo/content/changelog/_index.md index 18d2b702..3e5fb502 100644 --- a/hugo/content/changelog/_index.md +++ b/hugo/content/changelog/_index.md @@ -12,7 +12,7 @@ weight: 5 * The pgBackRest Grafana dashboard now presents data on a per-stanza basis * Removed deprecated node_exporter metrics from Grafana OS Details dashboard. Reorganized panels. * Added a basic Network Activity dashboard to Grafana using default metrics that come with node_exporter. - * The pgMonitor repository has been reorganized to better around which platforms files apply to . Some files have also been renamed as part of this reorganization. + * The pgMonitor repository has been reorganized around which platforms files apply to. Some files have also been renamed as part of this reorganization. * Extended the default alert threshold for pgBackRest backups to give a buffer time and avoid false positives when backup runtimes vary. * Added a default alert for PostgreSQL failover that should work in any scenario to produce an alert when the recovery status of a PostgreSQL database changes (replica -> primary or primary -> replica). Note that this alert will auto-resolve after 5 minutes (by default) since it is just looking for recent state changes. 
The alert is meant to be acted upon immediately to see what may have occured on the systems involved. * Added metric to monitor and alert on blocked queries From 6300b7ebc6700c04023a2f2843ca98ad7341956d Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Mon, 14 Jun 2021 09:26:39 -0400 Subject: [PATCH 10/11] Fix duplicate label in ccp_nodemx_mem --- postgres_exporter/common/queries_nodemx.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/postgres_exporter/common/queries_nodemx.yml b/postgres_exporter/common/queries_nodemx.yml index 93d5cffc..fd3d6ca4 100644 --- a/postgres_exporter/common/queries_nodemx.yml +++ b/postgres_exporter/common/queries_nodemx.yml @@ -52,9 +52,6 @@ ccp_nodemx_mem: - mapped_file: usage: "GAUGE" description: "Total bytes of mapped file (includes tmpfs/shmem)" - - mapped_file: - usage: "GAUGE" - description: "Total bytes of mapped file (includes tmpfs/shmem)" - dirty: usage: "GAUGE" description: "Total bytes that are waiting to get written back to the disk" From 91b9cb44e0b41c6dbb9c9e67f74ab1566827d93a Mon Sep 17 00:00:00 2001 From: Keith Fiske Date: Tue, 22 Jun 2021 16:25:00 -0400 Subject: [PATCH 11/11] Add exporter down rule to container example rules file --- .../crunchy-alert-rules-pg.yml.containers.example | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example b/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example index bad39e92..86900569 100644 --- a/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example +++ b/prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example @@ -20,6 +20,19 @@ groups: summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. 
Error count: ( {{ $value }} )' +########## SYSTEM RULES ########## + - alert: ExporterDown + expr: avg_over_time(up[5m]) < 0.5 + for: 10s + labels: + service: system + severity: critical + severity_num: 300 + annotations: + description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.' + summary: 'Prometheus Exporter Service Down' + + ########## POSTGRESQL RULES ########## - alert: PGIsUp expr: pg_up < 1