diff --git a/hugo/content/changelog/_index.md b/hugo/content/changelog/_index.md index 3e5fb502..cd8d5d06 100644 --- a/hugo/content/changelog/_index.md +++ b/hugo/content/changelog/_index.md @@ -6,7 +6,8 @@ weight: 5 ## 4.5 ### New Features - * Minimum required version of Grafana has been updated to 7.4.x + * Add preliminary support for PostgreSQL 14 + * Minimum required version of Grafana has been updated to 7.5.x * Updated Grafana Overview dashboards to support new Stat panel * Updated PostgreSQL Details Grafana dashboard with more information and to be able to present data grouped by clusters. The pgBackRest panel was removed from this dashboard. * The pgBackRest Grafana dashboard now presents data on a per-stanza basis diff --git a/postgres_exporter/common/pg14/queries_general.yml b/postgres_exporter/common/pg14/queries_general.yml new file mode 100644 index 00000000..17265cbd --- /dev/null +++ b/postgres_exporter/common/pg14/queries_general.yml @@ -0,0 +1,145 @@ +### +# +# Begin File: PG14 queries_general.yml +# +# Copyright 2017-2021 Crunchy Data Solutions, Inc. All Rights Reserved. +# +### + +ccp_connection_stats: + query: "select ((total - idle) - idle_in_txn) as active + , total + , idle + , idle_in_txn + , (select coalesce(extract(epoch from (max(now() - state_change))),0) from pg_catalog.pg_stat_activity where state = 'idle in transaction') as max_idle_in_txn_time + , (select coalesce(extract(epoch from (max(now() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' and state <> 'idle' ) as max_query_time + , (select coalesce(extract(epoch from (max(now() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' and wait_event_type = 'Lock' ) as max_blocked_query_time + , max_connections + from ( + select count(*) as total + , coalesce(sum(case when state = 'idle' then 1 else 0 end),0) as idle + , coalesce(sum(case when state = 'idle in transaction' then 1 else 0 end),0) as idle_in_txn from pg_catalog.pg_stat_activity) x + join (select setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true);" + metrics: + - active: + usage: "GAUGE" + description: "Total non-idle connections" + - total: + usage: "GAUGE" + description: "Total idle and non-idle connections" + - idle: + usage: "GAUGE" + description: "Total idle connections" + - idle_in_txn: + usage: "GAUGE" + description: "Total idle in transaction connections" + - max_idle_in_txn_time: + usage: "GAUGE" + description: "Length of time in seconds of the longest idle in transaction session" + - max_query_time: + usage: "GAUGE" + description: "Length of time in seconds of the longest running query" + - max_blocked_query_time: + usage: "GAUGE" + description: "Length of time in seconds of the longest running query that has been blocked by a heavyweight lock" + - max_connections: + usage: "GAUGE" + description: "Value of max_connections for the monitored database" + + +ccp_replication_lag: + query: "SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + END + AS replay_time + , EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER + AS received_time" + metrics: + - replay_time: + usage: "GAUGE" + description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL recieved equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation." + - received_time: + usage: "GAUGE" + description: "Length of time since the last WAL file was received and replayed on replica. Always increases, possibly causing false positives if the primary stops writing. Monitors for replicas that stop receiving WAL all together." + + +ccp_replication_lag_size: + query: "SELECT client_addr as replica + , client_hostname as replica_hostname + , client_port as replica_port + , pg_wal_lsn_diff(sent_lsn, replay_lsn) as bytes + FROM pg_catalog.pg_stat_replication" + metrics: + - replica: + usage: "LABEL" + description: "Replica address" + - replica_hostname: + usage: "LABEL" + description: "Replica hostname" + - replica_port: + usage: "LABEL" + description: "Replica port" + - bytes: + usage: "GAUGE" + description: "Replication lag in bytes" + + +ccp_replication_slots: + query: "SELECT slot_name, active::int, pg_wal_lsn_diff(pg_current_wal_insert_lsn(), restart_lsn) AS retained_bytes FROM pg_catalog.pg_replication_slots" + metrics: + - slot_name: + usage: "LABEL" + description: "Name of replication slot" + - active: + usage: "GAUGE" + description: "Active state of slot. 1 = true. 0 = false." + - retained_bytes: + usage: "GAUGE" + description: "The amount of WAL (in bytes) being retained for this slot" + + +ccp_wal_activity: + query: "SELECT last_5_min_size_bytes, + (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes + FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification > CURRENT_TIMESTAMP - '5 minutes'::interval) x;" + metrics: + - last_5_min_size_bytes: + usage: "GAUGE" + description: "Current size in bytes of the last 5 minutes of WAL generation. Includes recycled WALs." + - total_size_bytes: + usage: "GAUGE" + description: "Current size in bytes of the WAL directory" + + +ccp_data_checksum_failure: + query: "SELECT datname AS dbname + , checksum_failures AS count + , coalesce(extract(epoch from (now() - checksum_last_failure)), 0) AS time_since_last_failure_seconds + FROM pg_catalog.pg_stat_database;" + metrics: + - dbname: + usage: "LABEL" + description: "Database name" + - count: + usage: "GAUGE" + description: "Total number of checksum failures on this database" + - time_since_last_failure_seconds: + usage: "GAUGE" + description: "Time interval in seconds since the last checksum failure was encountered" + + +ccp_pg_hba_checksum: + query: "SELECT monitor.pg_hba_checksum() AS status" + metrics: + - status: + usage: "GAUGE" + description: "Value of checksum monitioring status for pg_catalog.pg_hba_file_rules (pg_hba.conf). 0 = valid config. 1 = settings changed. To reset current config to valid after alert, run monitor.pg_hba_checksum_set_valid()." + + +### +# +# End File: PG14 queries_general.yml +# +### diff --git a/postgres_exporter/common/pg14/queries_pg_stat_statements.yml b/postgres_exporter/common/pg14/queries_pg_stat_statements.yml new file mode 100644 index 00000000..924fb3be --- /dev/null +++ b/postgres_exporter/common/pg14/queries_pg_stat_statements.yml @@ -0,0 +1,135 @@ +### +# +# Begin File: PG14 queries_pg_stat_statements.yml +# +# Copyright 2017-2021 Crunchy Data Solutions, Inc. All Rights Reserved. +# +### + +ccp_pg_stat_statements_total: + query: "SELECT pg_get_userbyid(s.userid) as role, + d.datname AS dbname, + sum(s.calls) AS calls_count, + sum(s.total_exec_time) AS exec_time_ms, + avg(s.mean_exec_time) AS mean_exec_time_ms, + sum(s.rows) AS row_count + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON d.oid = s.dbid + GROUP BY 1,2" + metrics: + - role: + usage: "LABEL" + description: "Role that executed the statement" + - dbname: + usage: "LABEL" + description: "Database in which the statement was executed" + - calls_count: + usage: "GAUGE" + description: "Total number of queries run per user/database" + - exec_time_ms: + usage: "GAUGE" + description: "Total runtime of all queries per user/database" + - mean_exec_time_ms: + usage: "GAUGE" + description: "Mean runtime of all queries per user/database" + - row_count: + usage: "GAUGE" + description: "Total rows returned from all queries per user/database" + + +ccp_pg_stat_statements_top_mean: + query: "SELECT pg_get_userbyid(s.userid) as role, + d.datname AS dbname, + s.queryid, + btrim(replace(left(s.query, 40), '\n', '')) AS query, + max(s.mean_exec_time) exec_time_ms + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON d.oid = s.dbid + GROUP BY 1,2,3,4 + ORDER BY 5 DESC + LIMIT #PG_STAT_STATEMENTS_LIMIT#" + metrics: + - role: + usage: "LABEL" + description: "Role that executed the statement" + - dbname: + usage: "LABEL" + description: "Database in which the statement was executed" + - queryid: + usage: "LABEL" + description: "Internal hash code, computed from the statement's parse tree" + - query: + usage: "LABEL" + description: "First 40 characters of query text" + - exec_time_ms: + usage: "GAUGE" + description: "Average query runtime in milliseconds" + + +# Note that individual query stats can only be reset in PG12 +ccp_pg_stat_statements_top_total: + query: "SELECT pg_get_userbyid(s.userid) as role, + d.datname AS dbname, + s.queryid, + btrim(replace(left(s.query, 40), '\n', '')) AS query, + s.total_exec_time exec_time_ms + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON d.oid = s.dbid + ORDER BY 5 DESC + LIMIT #PG_STAT_STATEMENTS_LIMIT#" + metrics: + - role: + usage: "LABEL" + description: "Role that executed the statement" + - dbname: + usage: "LABEL" + description: "Database in which the statement was executed" + - queryid: + usage: "LABEL" + description: "Internal hash code, computed from the statement's parse tree" + - query: + usage: "LABEL" + description: "First 40 characters of query text" + - exec_time_ms: + usage: "GAUGE" + description: "Total time spent in the statement in milliseconds" + + +# Note that individual query stats can only be reset in PG12 +ccp_pg_stat_statements_top_max: + query: "SELECT pg_get_userbyid(s.userid) as role, + d.datname AS dbname, + s.queryid, + btrim(replace(left(s.query, 40), '\n', '')) AS query, + s.max_exec_time AS exec_time_ms + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON d.oid = s.dbid + ORDER BY 5 DESC + LIMIT #PG_STAT_STATEMENTS_LIMIT#" + metrics: + - role: + usage: "LABEL" + description: "Role that executed the statement" + - dbname: + usage: "LABEL" + description: "Database in which the statement was executed" + - queryid: + usage: "LABEL" + description: "Internal hash code, computed from the statement's parse tree" + - query: + usage: "LABEL" + description: "First 40 characters of query text" + - exec_time_ms: + usage: "GAUGE" + description: "Maximum time spent in the statement in milliseconds" + + +### +# +# End File: PG14 queries_pg_stat_statements.yml +# +### diff --git a/postgres_exporter/common/pg14/queries_pg_stat_statements_full.yml b/postgres_exporter/common/pg14/queries_pg_stat_statements_full.yml new file mode 100644 index 00000000..e8aba818 --- /dev/null +++ b/postgres_exporter/common/pg14/queries_pg_stat_statements_full.yml @@ -0,0 +1,153 @@ +### +# +# Begin File: PG14 queries_pg_stat_statements_full.yml +# +# Copyright 2017-2021 Crunchy Data Solutions, Inc. All Rights Reserved. +# +### + +# Gathers all metrics for per query monitoring. WARNING: This can greatly affect Prometheus storage and performance +ccp_pg_stat_statements: + query: "SELECT + pg_get_userbyid(s.userid) as role, + d.datname AS dbname, + s.queryid, + s.toplevel, + btrim(replace(left(s.query, 40), '\n', '')) AS query, + s.plans, + s.total_plan_time AS total_plan_time_ms, + s.min_plan_time AS min_plan_time_ms, + s.max_plan_time AS max_plan_time_ms, + s.mean_plan_time AS mean_plan_time_ms, + s.stddev_plan_time AS stddev_plan_time_ms, + s.calls, + s.total_exec_time AS total_exec_time_ms, + s.min_exec_time AS min_exec_time_ms, + s.max_exec_time AS max_exec_time_ms, + s.mean_exec_time AS mean_exec_time_ms, + s.stddev_exec_time AS stddev_exec_time_ms, + s.rows, + s.shared_blks_hit, + s.shared_blks_read, + s.shared_blks_dirtied, + s.shared_blks_written, + s.local_blks_hit, + s.local_blks_read, + s.local_blks_dirtied, + s.local_blks_written, + s.temp_blks_read, + s.temp_blks_written, + s.blk_read_time AS blk_read_time_ms, + s.blk_write_time AS blk_write_time_ms, + s.wal_records, + s.wal_fpi, + s.wal_bytes + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON d.oid = s.dbid" + metrics: + - role: + usage: "LABEL" + description: "Role that executed the statement" + - dbname: + usage: "LABEL" + description: "Database in which the statement was executed" + - toplevel: + usage: "LABEL" + description: "True if the query was executed as a top-level statement (always true if pg_stat_statements.track is set to top)" + - queryid: + usage: "LABEL" + description: "Internal hash code, computed from the statement's parse tree" + - query: + usage: "LABEL" + description: "First 40 characters of query text" + - plans: + usage: "GAUGE" + description: "Number of times the statement was planned (if pg_stat_statements.track_planning is enabled, otherwise zero)" + - total_plan_time_ms: + usage: "GAUGE" + description: "Total time spent planning the statement, in milliseconds (if pg_stat_statements.track_planning is enabled, otherwise zero)" + - min_plan_time_ms: + usage: "GAUGE" + description: "Minimum time spent planning the statement, in milliseconds (if pg_stat_statements.track_planning is enabled, otherwise zero)" + - max_plan_time_ms: + usage: "GAUGE" + description: "Maximum time spent planning the statement, in milliseconds (if pg_stat_statements.track_planning is enabled, otherwise zero)" + - mean_plan_time_ms: + usage: "GAUGE" + description: "Mean time spent planning the statement, in milliseconds (if pg_stat_statements.track_planning is enabled, otherwise zero)" + - stddev_plan_time_ms: + usage: "GAUGE" + description: "Population standard deviation of time spent planning the statement, in milliseconds (if pg_stat_statements.track_planning is enabled, otherwise zero)" + - calls: + usage: "GAUGE" + description: "Number of times executed" + - total_exec_time_ms: + usage: "GAUGE" + description: "Total time spent in the statement, in milliseconds" + - min_exec_time_ms: + usage: "GAUGE" + description: "Minimum time spent in the statement, in milliseconds" + - max_exec_time_ms: + usage: "GAUGE" + description: "Maximum time spent in the statement, in milliseconds" + - mean_exec_time_ms: + usage: "GAUGE" + description: "Mean time spent in the statement, in milliseconds" + - stddev_exec_time_ms: + usage: "GAUGE" + description: "Population standard deviation of time spent in the statement, in milliseconds" + - rows: + usage: "GAUGE" + description: "Total number of rows retrieved or affected by the statement" + - shared_blks_hit: + usage: "GAUGE" + description: "Total number of shared block cache hits by the statement" + - shared_blks_read: + usage: "GAUGE" + description: "Total number of shared blocks read by the statement" + - shared_blks_dirtied: + usage: "GAUGE" + description: "Total number of shared blocks dirtied by the statement" + - shared_blks_written: + usage: "GAUGE" + description: "Total number of shared blocks written by the statement" + - local_blks_hit: + usage: "GAUGE" + description: "Total number of local block cache hits by the statement" + - local_blks_read: + usage: "GAUGE" + description: "Total number of local blocks read by the statement" + - local_blks_dirtied: + usage: "GAUGE" + description: "Total number of local blocks dirtied by the statement" + - local_blks_written: + usage: "GAUGE" + description: "Total number of local blocks written by the statement" + - temp_blks_read: + usage: "GAUGE" + description: "Total number of temp blocks read by the statement" + - temp_blks_written: + usage: "GAUGE" + description: "Total number of temp blocks written by the statement" + - blk_read_time_ms: + usage: "GAUGE" + description: "Total time the statement spent reading blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)" + - blk_write_time_ms: + usage: "GAUGE" + description: "Total time the statement spent writing blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)" + - wal_records: + usage: "GAUGE" + description: "Total number of WAL records generated by the statement" + - wal_fpi: + usage: "GAUGE" + description: "Total number of WAL full page images generated by the statement" + - wal_bytes: + usage: "GAUGE" + description: "Total amount of WAL bytes generated by the statement" + +### +# +# End File: PG14 queries_pg_stat_statements_full.yml +# +### diff --git a/postgres_exporter/common/pg14/queries_pg_stat_statements_reset_info.yml b/postgres_exporter/common/pg14/queries_pg_stat_statements_reset_info.yml new file mode 100644 index 00000000..a03af99f --- /dev/null +++ b/postgres_exporter/common/pg14/queries_pg_stat_statements_reset_info.yml @@ -0,0 +1,20 @@ +### +# +# Begin File: pg_stat_statements_reset_info.yml +# +# Copyright 2017-2021 Crunchy Data Solutions, Inc. All Rights Reserved. +# +### +ccp_pg_stat_statements_reset: + query: "select monitor.pg_stat_statements_reset_info(#PG_STAT_STATEMENTS_THROTTLE_MINUTES#) as time" + metrics: + - time: + usage: "GAUGE" + description: "Epoch time when stats were reset" + +### +# +# End File: pg_stat_statements_reset_info.yml +# +### + diff --git a/postgres_exporter/common/pg14/setup.sql b/postgres_exporter/common/pg14/setup.sql new file mode 100644 index 00000000..ae6391a5 --- /dev/null +++ b/postgres_exporter/common/pg14/setup.sql @@ -0,0 +1,462 @@ +-- PG14 pgMonitor Setup +-- +-- Copyright 2017-2021 Crunchy Data Solutions, Inc. All Rights Reserved. +-- + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'ccp_monitoring') THEN + CREATE ROLE ccp_monitoring WITH LOGIN; + END IF; + + -- The pgmonitor role is required by the pgnodemx extension in PostgreSQL versions 9.5 and 9.6 + -- and should be removed when upgrading to PostgreSQL 10 and above. + IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'pgmonitor') THEN + DROP ROLE pgmonitor; + END IF; +END +$$; + +GRANT pg_monitor to ccp_monitoring; +GRANT pg_execute_server_program TO ccp_monitoring; + +ALTER ROLE ccp_monitoring SET lock_timeout TO '2min'; + +CREATE SCHEMA IF NOT EXISTS monitor AUTHORIZATION ccp_monitoring; + +DROP TABLE IF EXISTS monitor.pgbackrest_info CASCADE; +CREATE TABLE IF NOT EXISTS monitor.pgbackrest_info (config_file text NOT NULL, data jsonb NOT NULL, gather_timestamp timestamptz DEFAULT now() NOT NULL); +-- Force more aggressive autovacuum to avoid table bloat over time +ALTER TABLE monitor.pgbackrest_info SET (autovacuum_analyze_scale_factor = 0, autovacuum_vacuum_scale_factor = 0, autovacuum_vacuum_threshold = 10, autovacuum_analyze_threshold = 10); + +DROP FUNCTION IF EXISTS monitor.pgbackrest_info(); -- old version from 2.3 +DROP FUNCTION IF EXISTS monitor.pgbackrest_info(int); +CREATE OR REPLACE FUNCTION monitor.pgbackrest_info(p_throttle_minutes int DEFAULT 10) RETURNS SETOF monitor.pgbackrest_info + LANGUAGE plpgsql + SET search_path TO pg_catalog, pg_temp +AS $function$ +DECLARE + +v_gather_timestamp timestamptz; +v_throttle interval; +v_system_identifier bigint; + +BEGIN +-- Get pgBackRest info in JSON format + +v_throttle := make_interval(mins := p_throttle_minutes); + +SELECT COALESCE(max(gather_timestamp), '1970-01-01'::timestamptz) INTO v_gather_timestamp FROM monitor.pgbackrest_info; + +IF pg_catalog.pg_is_in_recovery() = 'f' THEN + IF ((CURRENT_TIMESTAMP - v_gather_timestamp) > v_throttle) THEN + + -- Ensure table is empty + DELETE FROM monitor.pgbackrest_info; + + SELECT system_identifier into v_system_identifier FROM pg_control_system(); + + -- Copy data into the table directory from the pgBackRest into command + EXECUTE format( $cmd$ COPY monitor.pgbackrest_info (config_file, data) FROM program '/usr/bin/pgbackrest-info.sh %s' WITH (format text,DELIMITER '|') $cmd$, v_system_identifier::text ); + + END IF; +END IF; + +RETURN QUERY SELECT * FROM monitor.pgbackrest_info; + +IF NOT FOUND THEN + RAISE EXCEPTION 'No backups being returned from pgbackrest info command'; +END IF; + +END +$function$; + + +DROP FUNCTION IF EXISTS monitor.sequence_status(); +CREATE FUNCTION monitor.sequence_status() RETURNS TABLE (sequence_name text, last_value bigint, slots numeric, used numeric, percent int, cycle boolean, numleft numeric, table_usage text) + LANGUAGE sql SECURITY DEFINER STABLE + SET search_path TO pg_catalog, pg_temp +AS $function$ + +/* + * Provide detailed status information of sequences in the current database + */ + +WITH default_value_sequences AS ( + -- Get sequences defined as default values with related table + -- Note this subquery can be locked/hung by DDL that affects tables with sequences. + -- Use monitor.sequence_exhaustion() to actually monitor for sequences running out + SELECT s.seqrelid, c.oid + FROM pg_catalog.pg_attribute a + JOIN pg_catalog.pg_attrdef ad on (ad.adrelid,ad.adnum) = (a.attrelid,a.attnum) + JOIN pg_catalog.pg_class c on a.attrelid = c.oid + JOIN pg_catalog.pg_sequence s ON s.seqrelid = regexp_replace(pg_get_expr(ad.adbin,ad.adrelid), $re$^nextval\('(.+?)'::regclass\)$$re$, $re$\1$re$)::regclass + WHERE (pg_get_expr(ad.adbin,ad.adrelid)) ~ '^nextval\(' +), dep_sequences AS ( + -- Get sequences set as dependencies with related tables (identities) + SELECT s.seqrelid, c.oid + FROM pg_catalog.pg_sequence s + JOIN pg_catalog.pg_depend d ON s.seqrelid = d.objid + JOIN pg_catalog.pg_class c ON d.refobjid = c.oid + UNION + SELECT seqrelid, oid FROM default_value_sequences +), all_sequences AS ( + -- Get any remaining sequences + SELECT s.seqrelid AS sequence_oid, ds.oid AS table_oid + FROM pg_catalog.pg_sequence s + LEFT JOIN dep_sequences ds ON s.seqrelid = ds.seqrelid +) +SELECT sequence_name + , last_value + , slots + , used + , ROUND(used/slots*100)::int AS percent + , cycle + , CASE WHEN slots < used THEN 0 ELSE slots - used END AS numleft + , table_usage +FROM ( + SELECT format('%I.%I',s.schemaname, s.sequencename)::text AS sequence_name + , COALESCE(s.last_value,s.min_value) AS last_value + , s.cycle + , CEIL((s.max_value-min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS slots + , CEIL((COALESCE(s.last_value,s.min_value)-s.min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS used + , string_agg(a.table_oid::regclass::text, ', ') AS table_usage + FROM pg_catalog.pg_sequences s + JOIN all_sequences a ON (format('%I.%I', s.schemaname, s.sequencename))::regclass = a.sequence_oid + GROUP BY 1,2,3,4,5 +) x +ORDER BY ROUND(used/slots*100) DESC + +$function$; + + +DROP FUNCTION IF EXISTS monitor.sequence_exhaustion(int); +CREATE FUNCTION monitor.sequence_exhaustion(p_percent integer DEFAULT 75, OUT count bigint) + LANGUAGE sql SECURITY DEFINER STABLE + SET search_path TO pg_catalog, pg_temp +AS $function$ + +/* + * Returns count of sequences that have used up the % value given via the p_percent parameter (default 75%) + */ + +SELECT count(*) AS count +FROM ( + SELECT CEIL((s.max_value-min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS slots + , CEIL((COALESCE(s.last_value,s.min_value)-s.min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS used + FROM pg_catalog.pg_sequences s +) x +WHERE (ROUND(used/slots*100)::int) > p_percent; + +$function$; + +/* + * Tables and functions for monitoring changes to pg_settings and pg_hba_file_rules system catalogs. + * Can't just do a raw check for the hash value since Prometheus only records numeric values for alerts + * Tables allow recording of existing settings so they can be referred back to to see what changed + * If either checksum function returns 0, then NO settings have changed + * If either checksum function returns 1, then something has changed since last known valid state + * For replicas, logging past settings is not possible to compare what may have changed + * For replicas, by default, it is expected that its settings will match the primary + * For replicas, if the pg_settings or pg_hba.conf are necessarily different from the primary, a known good hash of that replica's + settings can be sent as an argument to the relevant checksum function. Views are provided to easily obtain the hash values used by this monitoring tool. + * If any known hash parameters are passed to the checksum functions, note that it will override any past hash values stored in the log table when doing comparisons and completely re-evaluate the entire state. This is true even if done on a primary where the current state will then also be logged for comparison if it differs from the given hash. + */ + +DROP TABLE IF EXISTS monitor.pg_settings_checksum; +DROP TABLE IF EXISTS monitor.pg_hba_checksum; + +CREATE TABLE monitor.pg_settings_checksum ( + settings_hash_generated text NOT NULL + , settings_hash_known_provided text + , settings_string text NOT NULL + , created_at timestamptz DEFAULT now() NOT NULL + , valid smallint NOT NULL ); + +COMMENT ON COLUMN monitor.pg_settings_checksum.valid IS 'Set this column to zero if this group of settings is a valid change'; +CREATE INDEX ON monitor.pg_settings_checksum (created_at); + +CREATE TABLE monitor.pg_hba_checksum ( + hba_hash_generated text NOT NULL + , hba_hash_known_provided text + , hba_string text NOT NULL + , created_at timestamptz DEFAULT now() NOT NULL + , valid smallint NOT NULL ); + +COMMENT ON COLUMN monitor.pg_hba_checksum.valid IS 'Set this column to zero if this group of settings is a valid change'; +CREATE INDEX ON monitor.pg_hba_checksum (created_at); + + +DROP FUNCTION IF EXISTS monitor.pg_settings_checksum(text); +CREATE FUNCTION monitor.pg_settings_checksum(p_known_settings_hash text DEFAULT NULL) + RETURNS smallint + LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp +AS $function$ +DECLARE + +v_is_in_recovery boolean; +v_settings_hash text; +v_settings_hash_old text; +v_settings_match smallint := 0; +v_settings_string text; +v_settings_string_old text; +v_valid smallint; + +BEGIN + +SELECT pg_is_in_recovery() INTO v_is_in_recovery; + +SELECT md5_hash + , settings_string +INTO v_settings_hash + , v_settings_string +FROM monitor.pg_settings_hash; + +SELECT settings_hash_generated, valid +INTO v_settings_hash_old, v_valid +FROM monitor.pg_settings_checksum +ORDER BY created_at DESC LIMIT 1; + +IF p_known_settings_hash IS NOT NULL THEN + v_settings_hash_old := p_known_settings_hash; + -- Do not base validity on the stored value if manual hash is given. + v_valid := 0; +END IF; + +IF (v_settings_hash_old IS NOT NULL) THEN + + IF (v_settings_hash != v_settings_hash_old) THEN + + v_valid := 1; + + IF v_is_in_recovery = false THEN + INSERT INTO monitor.pg_settings_checksum ( + settings_hash_generated + , settings_hash_known_provided + , settings_string + , valid) + VALUES ( + v_settings_hash + , p_known_settings_hash + , v_settings_string + , v_valid); + END IF; + END IF; + +ELSE + + v_valid := 0; + IF v_is_in_recovery = false THEN + INSERT INTO monitor.pg_settings_checksum ( + settings_hash_generated + , settings_hash_known_provided + , settings_string + , valid) + VALUES (v_settings_hash + , p_known_settings_hash + , v_settings_string + , v_valid); + END IF; + +END IF; + +RETURN v_valid; + +END +$function$; + + +DROP FUNCTION IF EXISTS monitor.pg_hba_checksum(text); +CREATE FUNCTION monitor.pg_hba_checksum(p_known_hba_hash text DEFAULT NULL) + RETURNS smallint + LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp +AS $function$ +DECLARE + +v_hba_hash text; +v_hba_hash_old text; +v_hba_match smallint := 0; +v_hba_string text; +v_hba_string_old text; +v_is_in_recovery boolean; +v_valid smallint; + +BEGIN + +SELECT pg_is_in_recovery() INTO v_is_in_recovery; + +IF current_setting('server_version_num')::int >= 100000 THEN + + SELECT md5_hash + , hba_string + INTO v_hba_hash + , v_hba_string + FROM monitor.pg_hba_hash; + +ELSE + RAISE EXCEPTION 'pg_hba change monitoring unsupported in versions older than PostgreSQL 10'; +END IF; + +SELECT hba_hash_generated, valid +INTO v_hba_hash_old, v_valid +FROM monitor.pg_hba_checksum +ORDER BY created_at DESC LIMIT 1; + +IF p_known_hba_hash IS NOT NULL THEN + v_hba_hash_old := p_known_hba_hash; + -- Do not base validity on the stored value if manual hash is given. + v_valid := 0; +END IF; + +IF (v_hba_hash_old IS NOT NULL) THEN + + IF (v_hba_hash != v_hba_hash_old) THEN + + v_valid := 1; + + IF v_is_in_recovery = false THEN + INSERT INTO monitor.pg_hba_checksum ( + hba_hash_generated + , hba_hash_known_provided + , hba_string + , valid) + VALUES ( + v_hba_hash + , p_known_hba_hash + , v_hba_string + , v_valid); + END IF; + END IF; + +ELSE + + v_valid := 0; + IF v_is_in_recovery = false THEN + INSERT INTO monitor.pg_hba_checksum ( + hba_hash_generated + , hba_hash_known_provided + , hba_string + , valid) + VALUES (v_hba_hash + , p_known_hba_hash + , v_hba_string + , v_valid); + END IF; + +END IF; + +RETURN v_valid; + +END +$function$; + + +DROP FUNCTION IF EXISTS monitor.pg_settings_checksum_set_valid(); +/* + * This function provides quick, clear interface for resetting the checksum monitor to treat the currently detected configuration as valid after alerting on a change. Note that configuration history will be cleared. + */ +CREATE FUNCTION monitor.pg_settings_checksum_set_valid() RETURNS smallint + LANGUAGE sql +AS $function$ + +TRUNCATE monitor.pg_settings_checksum; + +SELECT monitor.pg_settings_checksum(); + +$function$; + + +DROP FUNCTION IF EXISTS monitor.pg_hba_checksum_set_valid(); +/* + * This function provides quick, clear interface for resetting the checksum monitor to treat the currently detected configuration as valid after alerting on a change. Note that configuration history will be cleared. + */ +CREATE FUNCTION monitor.pg_hba_checksum_set_valid() RETURNS smallint + LANGUAGE sql +AS $function$ + +TRUNCATE monitor.pg_hba_checksum; + +SELECT monitor.pg_hba_checksum(); + +$function$; + + +DROP VIEW IF EXISTS monitor.pg_settings_hash; +CREATE VIEW monitor.pg_settings_hash AS + WITH settings_ordered_list AS ( + SELECT name + , COALESCE(setting, '<>') AS setting + FROM pg_catalog.pg_settings + ORDER BY name, setting) + SELECT md5(string_agg(name||setting, ',')) AS md5_hash + , string_agg(name||setting, ',') AS settings_string + FROM settings_ordered_list; + + +DROP VIEW IF EXISTS monitor.pg_hba_hash; +CREATE VIEW monitor.pg_hba_hash AS + -- Order by line number so it's caught if no content is changed but the order of entries is changed + WITH hba_ordered_list AS ( + SELECT COALESCE(type, '<>') AS type + , array_to_string(COALESCE(database, ARRAY['<>']), ',') AS database + , array_to_string(COALESCE(user_name, ARRAY['<>']), ',') AS user_name + , COALESCE(address, '<>') AS address + , COALESCE(netmask, '<>') AS netmask + , COALESCE(auth_method, '<>') AS auth_method + , array_to_string(COALESCE(options, ARRAY['<>']), ',') AS options + FROM pg_catalog.pg_hba_file_rules + ORDER BY line_number) + SELECT md5(string_agg(type||database||user_name||address||netmask||auth_method||options, ',')) AS md5_hash + , string_agg(type||database||user_name||address||netmask||auth_method||options, ',') AS hba_string + FROM hba_ordered_list; + + + +DROP TABLE IF EXISTS monitor.pg_stat_statements_reset_info; +-- Table to store last reset time for pg_stat_statements +CREATE TABLE monitor.pg_stat_statements_reset_info( + reset_time timestamptz +); + +DROP FUNCTION IF EXISTS monitor.pg_stat_statements_reset_info(int); +-- Function to reset pg_stat_statements periodically +CREATE FUNCTION monitor.pg_stat_statements_reset_info(p_throttle_minutes integer DEFAULT 1440) + RETURNS bigint + LANGUAGE plpgsql + SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp +AS $function$ +DECLARE + + v_reset_timestamp timestamptz; + v_throttle interval; + +BEGIN + + IF p_throttle_minutes < 0 THEN + RETURN 0; + END IF; + + v_throttle := make_interval(mins := p_throttle_minutes); + + SELECT COALESCE(max(reset_time), '1970-01-01'::timestamptz) INTO v_reset_timestamp FROM monitor.pg_stat_statements_reset_info; + + IF ((CURRENT_TIMESTAMP - v_reset_timestamp) > v_throttle) THEN + -- Ensure table is empty + DELETE FROM monitor.pg_stat_statements_reset_info; + PERFORM pg_stat_statements_reset(); + INSERT INTO monitor.pg_stat_statements_reset_info(reset_time) values (now()); + END IF; + + RETURN (SELECT extract(epoch from reset_time) FROM monitor.pg_stat_statements_reset_info); + +EXCEPTION + WHEN others then + RETURN 0; +END +$function$; + +GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA monitor TO ccp_monitoring; +GRANT ALL ON ALL TABLES IN SCHEMA monitor TO ccp_monitoring; diff --git a/postgres_exporter/linux/pg14/sysconfig.postgres_exporter_pg14 b/postgres_exporter/linux/pg14/sysconfig.postgres_exporter_pg14 new file mode 100644 index 00000000..6a5af558 --- /dev/null +++ b/postgres_exporter/linux/pg14/sysconfig.postgres_exporter_pg14 @@ -0,0 +1,24 @@ +### +# +# Copyright 2017-2021 Crunchy Data Solutions, Inc. All Rights Reserved. +# +### + +# This file must be in a pathname that matches the EnvironmentFile entry in the service file (Default: /etc/sysconfig/postgres_exporter) + +# --web.listen-address: change '0.0.0.0' to the network IP assigned to this system if necessary, otherwise it will listen on any IP. Change port as necessary if running multiple instances. +# --extend.query-path: location of file containing custom queries to run. Location below is one recommended by crunchy setup steps. +# DATA_SOURCE_NAME: psql connection string. set the database that the exporter will connect to. Default is `postgres`. +# QUERY_FILE_LIST: space delimitted yml files that will be concatenated to a single queries.yml file. This should only be set once. If set multiple times, last one wins. + +PGBACKREST_INFO_THROTTLE_MINUTES=10 +PG_STAT_STATEMENTS_LIMIT=20 +PG_STAT_STATEMENTS_THROTTLE_MINUTES=-1 +OPT="--web.listen-address=0.0.0.0:9187 --extend.query-path=/etc/postgres_exporter/14/queries.yml --disable-default-metrics --disable-settings-metrics" +DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" + +# For global / cluster metric +QUERY_FILE_LIST="/etc/postgres_exporter/14/queries_global.yml /etc/postgres_exporter/14/queries_pg14.yml" + +# Recommend running separate exporter service to collect per-db metrics, even if there is only one database in the instance. Allows easier expansion to support multiple databases at a later time. See postgres_exporter_pg##_per_db file. + diff --git a/postgres_exporter/linux/pg14/sysconfig.postgres_exporter_pg14_per_db b/postgres_exporter/linux/pg14/sysconfig.postgres_exporter_pg14_per_db new file mode 100644 index 00000000..9cba179a --- /dev/null +++ b/postgres_exporter/linux/pg14/sysconfig.postgres_exporter_pg14_per_db @@ -0,0 +1,23 @@ +### +# +# Copyright 2017-2021 Crunchy Data Solutions, Inc. All Rights Reserved. +# +### + +# This file must be in a pathname that matches the EnvironmentFile entry in the service file (Default: /etc/sysconfig/postgres_exporter) + +# --web.listen-address: change '0.0.0.0' to the network IP assigned to this system if necessary, otherwise it will listen on any IP. Change port as necessary if running multiple instances. +# --extend.query-path: location of file containing custom queries to run. Location below is one recommended by crunchy setup steps. +# DATA_SOURCE_NAME: psql connection string. set the database that the exporter will connect to. Default is `postgres`. +# QUERY_FILE_LIST: space delimitted yml files that will be concatenated to a single queries.yml file. This should only be set once. If set multiple times, last one wins. + +# Ensure that listening port is different than global postgres_exporter (9188 below). Also ensure that query-path file is different from global one (queries_all_db_stats.yml below). +OPT="--web.listen-address=0.0.0.0:9188 --extend.query-path=/etc/postgres_exporter/14/queries_all_db_stats.yml --disable-default-metrics --disable-settings-metrics" +DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" +# As of v0.5.x of postgres_exporter, multiple DSN's can be given to connect to multiple databases with a single exporter. Separate DSNs with a comma and no spaces. Database name in example below is first the value after the "///" before the "?". Additional connection options can be given after "?", separated by "&". +#DATA_SOURCE_NAME="postgresql:///postgres?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable,postgresql:///mydb1?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable,postgresql:///mydb2?host=/var/run/postgresql/&user=ccp_monitoring&sslmode=disable" + +# Ex: For per db metric. DO NOT add queries that return the same labels/values on all databases otherwise the exporter will throw errors. The queries that pgmonitor provides for per-db metrics all include a "dbname" label to distinguish them. +QUERY_FILE_LIST="/etc/postgres_exporter/14/queries_per_db.yml" +# QUERY_FILE_LIST="/etc/postgres_exporter/14/queries_per_db.yml /etc/postgres_exporter/14/queries_bloat.yml" +