From 10746a867030b95f4ec68c30e6e6c13afe3ab137 Mon Sep 17 00:00:00 2001 From: awgreene Date: Mon, 11 Nov 2019 09:19:24 -0500 Subject: [PATCH] Add OLM CSV metrics --- docs/data-collection.md | 8 +++++++- docs/sample-metrics.md | 2 +- jsonnet/telemeter/metrics.jsonnet | 8 +++++++- manifests/benchmark/statefulSetTelemeterServer.yaml | 2 ++ manifests/client/deployment.yaml | 2 ++ 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/data-collection.md b/docs/data-collection.md index 59f362566..5ec71b7f9 100644 --- a/docs/data-collection.md +++ b/docs/data-collection.md @@ -105,7 +105,13 @@ For the OpenShift 4 Developer Preview we will be sending back these exact attrib // subscription_sync_total is the number of times an OLM operator // Subscription has been synced, labelled by name and installed csv '{__name__="subscription_sync_total"}', - // + // csv_succeeded is unique to the namespace, name, version, and phase labels. + // The metric is always present and can be equal to 0 or 1, where 0 represents that the + // csv is not in the succeeded state while 1 represents that the csv is in the succeeded state. + '{__name__="csv_succeeded"}', + // csv_abnormal represents the reason why a csv is not in the succeeded state and includes the + // namespace, name, version, phase, and reason labels. When a csv is updated, the previous time series associated with the csv will be deleted. + '{__name__="csv_abnormal"}', // OCS metrics to be collected: // ceph_cluster_total_bytes gives the size of ceph cluster in bytes. 
'{__name__="ceph_cluster_total_bytes"}', diff --git a/docs/sample-metrics.md b/docs/sample-metrics.md index 72cd1371c..e5b6542cb 100644 --- a/docs/sample-metrics.md +++ b/docs/sample-metrics.md @@ -13,7 +13,7 @@ return the full set of metrics that the Telemeter client captures: [embedmd]:# (telemeter_query txt) ```txt -{__name__="up"} or {__name__="cluster_version"} or {__name__="cluster_version_available_updates"} or {__name__="cluster_operator_up"} or {__name__="cluster_operator_conditions"} or {__name__="cluster_version_payload"} or {__name__="cluster_installer"} or {__name__="cluster_infrastructure_provider"} or {__name__="cluster_feature_set"} or {__name__="node_uname_info"} or {__name__="instance:etcd_object_counts:sum"} or {__name__="ALERTS",alertstate="firing"} or {__name__="code:apiserver_request_count:rate:sum"} or {__name__="cluster:capacity_cpu_cores:sum"} or {__name__="cluster:capacity_memory_bytes:sum"} or {__name__="cluster:cpu_usage_cores:sum"} or {__name__="cluster:memory_usage_bytes:sum"} or {__name__="openshift:cpu_usage_cores:sum"} or {__name__="openshift:memory_usage_bytes:sum"} or {__name__="workload:cpu_usage_cores:sum"} or {__name__="workload:memory_usage_bytes:sum"} or {__name__="cluster:virt_platform_nodes:sum"} or {__name__="cluster:node_instance_type_count:sum"} or {__name__="cnv:vmi_status_running:count"} or {__name__="node_role_os_version_machine:cpu_capacity_cores:sum"} or {__name__="node_role_os_version_machine:cpu_capacity_sockets:sum"} or {__name__="subscription_sync_total"} or {__name__="ceph_cluster_total_bytes"} or {__name__="ceph_cluster_total_used_raw_bytes"} or {__name__="ceph_health_status"} or {__name__="job:ceph_osd_metadata:count"} or {__name__="job:kube_pv:count"} or {__name__="job:ceph_pools_iops:total"} or {__name__="job:ceph_pools_iops_bytes:total"} or {__name__="job:ceph_versions_running:count"} or {__name__="job:noobaa_total_unhealthy_buckets:sum"} or {__name__="job:noobaa_bucket_count:sum"} or 
{__name__="job:noobaa_total_object_count:sum"} or {__name__="noobaa_accounts_num"} or {__name__="noobaa_total_usage"} or {__name__="console_url"} or {__name__="cluster:network_attachment_definition_instances:max"} or {__name__="cluster:network_attachment_definition_enabled_instance_up:max"} +{__name__="up"} or {__name__="cluster_version"} or {__name__="cluster_version_available_updates"} or {__name__="cluster_operator_up"} or {__name__="cluster_operator_conditions"} or {__name__="cluster_version_payload"} or {__name__="cluster_installer"} or {__name__="cluster_infrastructure_provider"} or {__name__="cluster_feature_set"} or {__name__="node_uname_info"} or {__name__="instance:etcd_object_counts:sum"} or {__name__="ALERTS",alertstate="firing"} or {__name__="code:apiserver_request_count:rate:sum"} or {__name__="cluster:capacity_cpu_cores:sum"} or {__name__="cluster:capacity_memory_bytes:sum"} or {__name__="cluster:cpu_usage_cores:sum"} or {__name__="cluster:memory_usage_bytes:sum"} or {__name__="openshift:cpu_usage_cores:sum"} or {__name__="openshift:memory_usage_bytes:sum"} or {__name__="workload:cpu_usage_cores:sum"} or {__name__="workload:memory_usage_bytes:sum"} or {__name__="cluster:virt_platform_nodes:sum"} or {__name__="cluster:node_instance_type_count:sum"} or {__name__="cnv:vmi_status_running:count"} or {__name__="node_role_os_version_machine:cpu_capacity_cores:sum"} or {__name__="node_role_os_version_machine:cpu_capacity_sockets:sum"} or {__name__="subscription_sync_total"} or {__name__="csv_succeeded"} or {__name__="csv_abnormal"} or {__name__="ceph_cluster_total_bytes"} or {__name__="ceph_cluster_total_used_raw_bytes"} or {__name__="ceph_health_status"} or {__name__="job:ceph_osd_metadata:count"} or {__name__="job:kube_pv:count"} or {__name__="job:ceph_pools_iops:total"} or {__name__="job:ceph_pools_iops_bytes:total"} or {__name__="job:ceph_versions_running:count"} or {__name__="job:noobaa_total_unhealthy_buckets:sum"} or 
{__name__="job:noobaa_bucket_count:sum"} or {__name__="job:noobaa_total_object_count:sum"} or {__name__="noobaa_accounts_num"} or {__name__="noobaa_total_usage"} or {__name__="console_url"} or {__name__="cluster:network_attachment_definition_instances:max"} or {__name__="cluster:network_attachment_definition_enabled_instance_up:max"} ``` For reference, here is an example response produced by a running OpenShift cluster: diff --git a/jsonnet/telemeter/metrics.jsonnet b/jsonnet/telemeter/metrics.jsonnet index 137cc5e26..996e919e4 100644 --- a/jsonnet/telemeter/metrics.jsonnet +++ b/jsonnet/telemeter/metrics.jsonnet @@ -97,7 +97,13 @@ // subscription_sync_total is the number of times an OLM operator // Subscription has been synced, labelled by name and installed csv '{__name__="subscription_sync_total"}', - // + // csv_succeeded is unique to the namespace, name, version, and phase labels. + // The metric is always present and can be equal to 0 or 1, where 0 represents that the + // csv is not in the succeeded state while 1 represents that the csv is in the succeeded state. + '{__name__="csv_succeeded"}', + // csv_abnormal represents the reason why a csv is not in the succeeded state and includes the + // namespace, name, version, phase, and reason labels. When a csv is updated, the previous time series associated with the csv will be deleted. + '{__name__="csv_abnormal"}', // OCS metrics to be collected: // ceph_cluster_total_bytes gives the size of ceph cluster in bytes. 
'{__name__="ceph_cluster_total_bytes"}', diff --git a/manifests/benchmark/statefulSetTelemeterServer.yaml b/manifests/benchmark/statefulSetTelemeterServer.yaml index ae1b8dfdc..72efb2135 100644 --- a/manifests/benchmark/statefulSetTelemeterServer.yaml +++ b/manifests/benchmark/statefulSetTelemeterServer.yaml @@ -52,6 +52,8 @@ spec: - --whitelist={__name__="node_role_os_version_machine:cpu_capacity_cores:sum"} - --whitelist={__name__="node_role_os_version_machine:cpu_capacity_sockets:sum"} - --whitelist={__name__="subscription_sync_total"} + - --whitelist={__name__="csv_succeeded"} + - --whitelist={__name__="csv_abnormal"} - --whitelist={__name__="ceph_cluster_total_bytes"} - --whitelist={__name__="ceph_cluster_total_used_raw_bytes"} - --whitelist={__name__="ceph_health_status"} diff --git a/manifests/client/deployment.yaml b/manifests/client/deployment.yaml index b63ebe600..4bd30e663 100644 --- a/manifests/client/deployment.yaml +++ b/manifests/client/deployment.yaml @@ -54,6 +54,8 @@ spec: - --match={__name__="node_role_os_version_machine:cpu_capacity_cores:sum"} - --match={__name__="node_role_os_version_machine:cpu_capacity_sockets:sum"} - --match={__name__="subscription_sync_total"} + - --match={__name__="csv_succeeded"} + - --match={__name__="csv_abnormal"} - --match={__name__="ceph_cluster_total_bytes"} - --match={__name__="ceph_cluster_total_used_raw_bytes"} - --match={__name__="ceph_health_status"}