From ea28d08a1566e9666b83ac0724e82e4f619f2629 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Tue, 18 Jun 2019 17:02:30 -0700 Subject: [PATCH 01/12] hard code config for UST CCP team --- installer/conf/telegraf-rs.conf | 38 ++++++++++++++++++++++++++++++++ installer/conf/telegraf.conf | 39 +++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index cb9a36685..d7e6fd16c 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -535,6 +535,44 @@ # insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] # [inputs.prometheus.tagpass] +[[inputs.prometheus]] + #name_prefix="container.azm.ms/" + ## An array of urls to scrape metrics from. + #urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "http://$NODE_IP:10254/metrics", "http://$NODE_IP:9100/metrics"] + #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] + + metric_version = 2 + url_tag = "scrapeUrl" + + ## An array of Kubernetes services to scrape metrics from. + kubernetes_services = ["http://prometheus-operated.monitoring:9090/metrics", "https://kube-state-metrics.monitoring:9443"] + + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + # monitor_kubernetes_pods = true + + ## Use bearer token for authorization. 
('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] [[inputs.exec]] ## Commands array diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 06b1c55eb..185fea5be 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -568,6 +568,45 @@ insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] +[[inputs.prometheus]] + #name_prefix="container.azm.ms/" + ## An array of urls to scrape metrics from. + urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "http://$NODE_IP:10254/metrics", "http://$NODE_IP:9100/metrics"] + #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] + + metric_version = 2 + url_tag = "scrapeUrl" + + ## An array of Kubernetes services to scrape metrics from. + # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] + + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. 
+ ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + # monitor_kubernetes_pods = true + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] + [[inputs.exec]] ## Commands array interval = "15m" From e9cfdaa41cb997c3d104a5b4b456fc163759fb1a Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Fri, 21 Jun 2019 14:48:49 -0700 Subject: [PATCH 02/12] fix config --- installer/conf/td-agent-bit-rs.conf | 7 ++++--- installer/conf/telegraf-rs.conf | 4 ++-- installer/conf/telegraf.conf | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/installer/conf/td-agent-bit-rs.conf b/installer/conf/td-agent-bit-rs.conf index 7945261aa..0e7218a2b 100644 --- a/installer/conf/td-agent-bit-rs.conf +++ b/installer/conf/td-agent-bit-rs.conf @@ -8,11 +8,12 @@ Name tail Tag oms.container.log.telegraf.err.* Path /var/opt/microsoft/docker-cimprov/log/telegraf.log - DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db - Mem_Buf_Limit 2m + DB /var/opt/microsoft/docker-cimprov/state/telegraf-rs-log-state.db + DB.Sync Off + Mem_Buf_Limit 1m Path_Key filepath Skip_Long_Lines On - Ignore_Older 5m + Ignore_Older 2m [INPUT] Name tcp diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index d7e6fd16c..75ecadb34 100644 --- 
a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -75,7 +75,7 @@ ## Run telegraf with debug log messages. debug = false ## Run telegraf in quiet mode (error log messages only). - quiet = true + quiet = false ## Specify the log file name. The empty string means to log to stderr. logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" @@ -545,7 +545,7 @@ url_tag = "scrapeUrl" ## An array of Kubernetes services to scrape metrics from. - kubernetes_services = ["http://prometheus-operated.monitoring:9090/metrics", "https://kube-state-metrics.monitoring:9443"] + kubernetes_services = ["http://prometheus-operated.monitoring:9090/metrics","http://prometheus-operator.monitoring:8080/metrics", "http://prometheus-k8s.monitoring:9090/metrics","https://kube-state-metrics.monitoring:8443/metrics"] ## Kubernetes config file to create client from. # kube_config = "/path/to/kubernetes.config" diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 185fea5be..2868f3c8b 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -571,7 +571,7 @@ [[inputs.prometheus]] #name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. 
- urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "http://$NODE_IP:10254/metrics", "http://$NODE_IP:9100/metrics"] + urls = ["http://kubelet.kube-system:10255/metrics", "http://kubelet.kube-system:10255/cadvisor", "http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics", "https://node-exporter.monitoring:9100/metrics"] #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] metric_version = 2 From f2724f04d7e5e6e41e8349a9a0930e177495abad Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Fri, 21 Jun 2019 15:04:44 -0700 Subject: [PATCH 03/12] fix config after discussion --- installer/conf/telegraf-rs.conf | 2 +- installer/conf/telegraf.conf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index 75ecadb34..abf0fef14 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -545,7 +545,7 @@ url_tag = "scrapeUrl" ## An array of Kubernetes services to scrape metrics from. - kubernetes_services = ["http://prometheus-operated.monitoring:9090/metrics","http://prometheus-operator.monitoring:8080/metrics", "http://prometheus-k8s.monitoring:9090/metrics","https://kube-state-metrics.monitoring:8443/metrics"] + kubernetes_services = ["http://prometheus-operated.monitoring:9090/metrics","http://prometheus-operator.monitoring:8080/metrics", "http://prometheus-k8s.monitoring:9090/metrics","https://kube-state-metrics.monitoring:8443/metrics","https://kube-state-metrics.monitoring:9443/metrics"] ## Kubernetes config file to create client from. # kube_config = "/path/to/kubernetes.config" diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 2868f3c8b..80c3dd564 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -571,14 +571,14 @@ [[inputs.prometheus]] #name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. 
- urls = ["http://kubelet.kube-system:10255/metrics", "http://kubelet.kube-system:10255/cadvisor", "http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics", "https://node-exporter.monitoring:9100/metrics"] + #urls = ["http://kubelet.kube-system:10255/metrics", "http://kubelet.kube-system:10255/cadvisor", "http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics", "https://node-exporter.monitoring:9100/metrics"] #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] metric_version = 2 url_tag = "scrapeUrl" ## An array of Kubernetes services to scrape metrics from. - # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] + kubernetes_services = ["http://kubelet.kube-system:10255/metrics", "http://kubelet.kube-system:10255/cadvisor", "http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics", "https://node-exporter.monitoring:9100/metrics"] ## Kubernetes config file to create client from. # kube_config = "/path/to/kubernetes.config" From ffc7a378090623560c3a4e23bd590312db12886a Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Fri, 21 Jun 2019 15:16:35 -0700 Subject: [PATCH 04/12] fix error log to get errors --- installer/conf/telegraf.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 80c3dd564..226be2f43 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -75,7 +75,7 @@ ## Run telegraf with debug log messages. debug = false ## Run telegraf in quiet mode (error log messages only). - quiet = true + quiet = false ## Specify the log file name. The empty string means to log to stderr.
logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" From 7146253e878d72a77bd968f2206e8965e7827577 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Fri, 21 Jun 2019 18:02:10 -0700 Subject: [PATCH 05/12] fix config --- installer/conf/telegraf-rs.conf | 2 +- installer/conf/telegraf.conf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index abf0fef14..07233cf15 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -545,7 +545,7 @@ url_tag = "scrapeUrl" ## An array of Kubernetes services to scrape metrics from. - kubernetes_services = ["http://prometheus-operated.monitoring:9090/metrics","http://prometheus-operator.monitoring:8080/metrics", "http://prometheus-k8s.monitoring:9090/metrics","https://kube-state-metrics.monitoring:8443/metrics","https://kube-state-metrics.monitoring:9443/metrics"] + kubernetes_services = ["https://kube-state-metrics.monitoring:8443/metrics","https://kube-state-metrics.monitoring:9443/metrics","http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics"] ## Kubernetes config file to create client from. # kube_config = "/path/to/kubernetes.config" diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 226be2f43..eadb60377 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -571,14 +571,14 @@ [[inputs.prometheus]] #name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. 
- #urls = ["http://kubelet.kube-system:10255/metrics", "http://kubelet.kube-system:10255/cadvisor", "http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics", "https://node-exporter.monitoring:9100/metrics"] + urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "https://$NODE_IP:9100/metrics"] #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] metric_version = 2 url_tag = "scrapeUrl" ## An array of Kubernetes services to scrape metrics from. - kubernetes_services = ["http://kubelet.kube-system:10255/metrics", "http://kubelet.kube-system:10255/cadvisor", "http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics", "https://node-exporter.monitoring:9100/metrics"] + #kubernetes_services = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "https://$NODE_IP:9100/metrics"] ## Kubernetes config file to create client from. # kube_config = "/path/to/kubernetes.config" From 0172a022a608aa18ab8f117a55f1d59f334a743d Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Mon, 24 Jun 2019 16:52:49 -0700 Subject: [PATCH 06/12] update config --- installer/conf/telegraf-rs.conf | 14 +++++++++++++- installer/conf/telegraf.conf | 13 ++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index 07233cf15..bde27f279 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -535,17 +535,29 @@ # insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] # [inputs.prometheus.tagpass] + +#Prometheus Custom Metrics [[inputs.prometheus]] #name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. 
#urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "http://$NODE_IP:10254/metrics", "http://$NODE_IP:9100/metrics"] #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] + interval: "$AZMON_RS_PROM_INTERVAL" + ## An array of urls to scrape metrics from. + urls = ["$AZMON_RS_PROM_URLS"] + + kubernetes_services = ["$AZMON_RS_PROM_K8S_SERVICES"] + monitor_kubernetes_pods = $AZMON_RS_PROM_MONITOR_PODS + + + fieldpass = ["$AZMON_RS_PROM_FIELDPASS"] + fielddrop = ["$AZMON_RS_PROM_FIELDDROP"] metric_version = 2 url_tag = "scrapeUrl" ## An array of Kubernetes services to scrape metrics from. - kubernetes_services = ["https://kube-state-metrics.monitoring:8443/metrics","https://kube-state-metrics.monitoring:9443/metrics","http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics"] + #kubernetes_services = ["https://kube-state-metrics.monitoring:8443/metrics","https://kube-state-metrics.monitoring:9443/metrics","http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics"] ## Kubernetes config file to create client from. # kube_config = "/path/to/kubernetes.config" diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index eadb60377..fa1d72ea7 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -568,11 +568,18 @@ insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] + +## prometheus custom metrics [[inputs.prometheus]] - #name_prefix="container.azm.ms/" + + interval: "$AZMON_DS_PROM_INTERVAL" + ## An array of urls to scrape metrics from. 
- urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "https://$NODE_IP:9100/metrics"] - #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] + urls = ["$AZMON_DS_PROM_URLS"] + + fieldpass = ["$AZMON_DS_PROM_FIELDPASS"] + + fielddrop = ["$AZMON_DS_PROM_FIELDDROP"] metric_version = 2 url_tag = "scrapeUrl" From 4b213e2c44b8bcffee3900078b4d3f44848ac524 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Thu, 27 Jun 2019 15:37:47 -0700 Subject: [PATCH 07/12] Add telemetry --- source/code/go/src/plugins/oms.go | 4 +-- .../code/plugin/CAdvisorMetricsAPIClient.rb | 34 ++++++++++++++++++- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index b925e7145..301aff1ed 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -34,14 +34,12 @@ const ResourceIdEnv = "AKS_RESOURCE_ID" //env variable which has ResourceName for NON-AKS const ResourceNameEnv = "ACS_RESOURCE_NAME" -// Origin prefix for telegraf Metrics (used as prefix for origin field & prefix for azure monitor specific tags) +// Origin prefix for telegraf Metrics (used as prefix for origin field & prefix for azure monitor specific tags and also for custom-metrics telemetry ) const TelegrafMetricOriginPrefix = "container.azm.ms" // Origin suffix for telegraf Metrics (used as suffix for origin field) const TelegrafMetricOriginSuffix = "telegraf" -// Namespace prefix for telegraf Metrics (used as prefix for Namespace field) -//const TelegrafMetricNamespacePrefix = "plugin" // clusterName tag const TelegrafTagClusterName = "clusterName" diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index b842edb29..ec38bcbb5 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -14,12 +14,31 @@ class CAdvisorMetricsAPIClient 
require_relative "ApplicationInsightsUtility" @configMapMountPath = "/etc/config/settings/log-data-collection-settings" + @promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" @clusterEnvVarCollectionEnabled = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] @clusterStdErrLogCollectionEnabled = ENV["AZMON_COLLECT_STDERR_LOGS"] @clusterStdOutLogCollectionEnabled = ENV["AZMON_COLLECT_STDOUT_LOGS"] @clusterLogTailExcludPath = ENV["AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH"] @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] + + @rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] + @dsPromInterval = ENV["TELEMETRY_DS_PROM_INTERVAL"] + + @rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] + @dsPromFieldPassCount = ENV["TELEMETRY_DS_PROM_FIELDPASS_LENGTH"] + + @rsPromFieldDropCount = ENV["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"] + @dsPromFieldDropCount = ENV["TELEMETRY_DS_PROM_FIELDDROP_LENGTH"] + + @rsPromK8sServiceCount = ENV["TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH"] + + @rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"] + @dsPromUrlCount = ENV["TELEMETRY_DS_PROM_URLS_LENGTH"] + + @rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"] + + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M # @@rxBytesLast = nil @@ -199,7 +218,7 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["PodName"] = podName telemetryProps["ContainerName"] = containerName telemetryProps["Computer"] = hostName - #telemetry about custom log collections setting + #telemetry about log collections settings if (File.file?(@configMapMountPath)) telemetryProps["clustercustomsettings"] = true telemetryProps["clusterenvvars"] = @clusterEnvVarCollectionEnabled @@ -209,6 +228,19 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, 
met telemetryProps["clusterLogTailPath"] = @clusterLogTailPath telemetryProps["clusterAgentSchemaVersion"] = @clusterAgentSchemaVersion end + #telemetry about prometheus metric collections settings + if (File.file?(@promConfigMountPath)) + telemetryProps["rsPromInt"] = @rsPromInterval + telemetryProps["dsPromInt"] = @dsPromInterval + telemetryProps["rsPromFPC"] = @rsPromFieldPassCount + telemetryProps["dsPromFPC"] = @dsPromFieldPassCount + telemetryProps["rsPromFDC"] = @rsPromFieldDropCount + telemetryProps["dsPromFDC"] = @dsPromFieldDropCount + telemetryProps["rsPromServ"] = @rsPromK8sServiceCount + telemetryProps["rsPromUrl"] = @rsPromUrlCount + telemetryProps["dsPromUrl"] = @dsPromUrlCount + telemetryProps["rsPromMonPods"] = @rsPromMonitorPods + end ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end end From 903071b411f5babe3b9296b60a8de03b9746c8d2 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 27 Jun 2019 16:07:13 -0700 Subject: [PATCH 08/12] Rashmi/promcustomconfig (#231) * changes * formatting changes * changes * changes * changes * changes * changes * changes * changes * changes * adding telemetry * changes * changes * changes * changes * changes * changes * changes * cahnges * changes --- installer/conf/telegraf-rs.conf | 2 +- installer/conf/telegraf-test-rs.conf | 113 +++++++++++ installer/conf/telegraf-test.conf | 100 ++++++++++ installer/conf/telegraf.conf | 2 +- installer/datafiles/base_container.data | 5 +- .../scripts/tomlparser-prom-customconfig.rb | 184 ++++++++++++++++++ installer/scripts/tomlparser.rb | 82 ++++---- 7 files changed, 444 insertions(+), 44 deletions(-) create mode 100644 installer/conf/telegraf-test-rs.conf create mode 100644 installer/conf/telegraf-test.conf create mode 100644 installer/scripts/tomlparser-prom-customconfig.rb diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index bde27f279..8e8665104 100644 --- 
a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -542,7 +542,7 @@ ## An array of urls to scrape metrics from. #urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "http://$NODE_IP:10254/metrics", "http://$NODE_IP:9100/metrics"] #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] - interval: "$AZMON_RS_PROM_INTERVAL" + interval = "$AZMON_RS_PROM_INTERVAL" ## An array of urls to scrape metrics from. urls = ["$AZMON_RS_PROM_URLS"] diff --git a/installer/conf/telegraf-test-rs.conf b/installer/conf/telegraf-test-rs.conf new file mode 100644 index 000000000..4ece2bf8c --- /dev/null +++ b/installer/conf/telegraf-test-rs.conf @@ -0,0 +1,113 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. 
Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "60s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = false + ## Specify the log file name. The empty string means to log to stderr. + logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" + + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do not set the "host" tag in the telegraf agent.
+ omit_hostname = true + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +#Prometheus Custom Metrics +[[inputs.prometheus]] + ## Collection interval for this input, taken from the AZMON_RS_PROM_INTERVAL environment variable. + interval = "$AZMON_RS_PROM_INTERVAL" + + ## An array of urls to scrape metrics from. + #urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "http://$NODE_IP:10254/metrics", "http://$NODE_IP:9100/metrics"] + urls = ["$AZMON_RS_PROM_URLS"] + + #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] + fieldpass = ["$AZMON_RS_PROM_FIELDPASS"] + fielddrop = ["$AZMON_RS_PROM_FIELDDROP"] + + ## An array of Kubernetes services to scrape metrics from. + #kubernetes_services = ["https://kube-state-metrics.monitoring:8443/metrics","https://kube-state-metrics.monitoring:9443/metrics","http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics"] + kubernetes_services = ["$AZMON_RS_PROM_K8S_SERVICES"] + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + monitor_kubernetes_pods = $AZMON_RS_PROM_MONITOR_PODS + + metric_version = 2 + url_tag = "scrapeUrl" + + ## Use bearer token for authorization. 
('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + ## Use TLS but skip chain & host verification + insecure_skip_verify = true diff --git a/installer/conf/telegraf-test.conf b/installer/conf/telegraf-test.conf new file mode 100644 index 000000000..f1a7880ad --- /dev/null +++ b/installer/conf/telegraf-test.conf @@ -0,0 +1,100 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). 
+ metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "60s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = false + ## Specify the log file name. The empty string means to log to stderr. + logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" + + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do not set the "host" tag in the telegraf agent.
+ omit_hostname = true + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +#Prometheus Custom Metrics +[[inputs.prometheus]] + ## Collection interval for this input, taken from the AZMON_DS_PROM_INTERVAL environment variable. + interval = "$AZMON_DS_PROM_INTERVAL" + + ## An array of urls to scrape metrics from. + #urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "http://$NODE_IP:10254/metrics", "http://$NODE_IP:9100/metrics"] + urls = ["$AZMON_DS_PROM_URLS"] + + fieldpass = ["$AZMON_DS_PROM_FIELDPASS"] + fielddrop = ["$AZMON_DS_PROM_FIELDDROP"] + + metric_version = 2 + url_tag = "scrapeUrl" + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + ## Use TLS but skip chain & host verification + insecure_skip_verify = true diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index fa1d72ea7..a83db55cf 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -572,7 +572,7 @@ ## prometheus custom metrics [[inputs.prometheus]] - interval: "$AZMON_DS_PROM_INTERVAL" + interval = "$AZMON_DS_PROM_INTERVAL" ## An array of urls to scrape metrics from. 
urls = ["$AZMON_DS_PROM_URLS"] diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 58a74aa0a..5a18805be 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -110,9 +110,12 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root -/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root +/opt/telegraf-test.conf; installer/conf/telegraf-test.conf; 644; root; root +/opt/telegraf-test-rs.conf; installer/conf/telegraf-test-rs.conf; 644; root; root +/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root /opt/livenessprobe.sh; installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root +/opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root diff --git a/installer/scripts/tomlparser-prom-customconfig.rb b/installer/scripts/tomlparser-prom-customconfig.rb new file mode 100644 index 000000000..5df83c89a --- /dev/null +++ b/installer/scripts/tomlparser-prom-customconfig.rb @@ -0,0 +1,184 @@ +#!/usr/local/bin/ruby + +require_relative "tomlrb" + +@promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" +@replicaset = "replicaset" +@daemonset = "daemonset" +@configSchemaVersion = "" + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is 
created + if (File.file?(@promConfigMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values for prometheus config map" + parsedConfig = Tomlrb.load_file(@promConfigMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted prometheus config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for prometheus scraping" + return nil + end + rescue => errorStr + puts "config::error::Exception while parsing toml config file for prometheus config: #{errorStr}, using defaults" + return nil + end +end + +def checkForTypeArray(arrayValue, arrayType) + if !arrayValue.nil? && arrayValue.kind_of?(Array) && arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType) + return true + else + return false + end +end + +def checkForType(variable, varType) + if !variable.nil? && variable.kind_of?(varType) + return true + else + return false + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + # Checking to see if this is the daemonset or replicaset to parse config accordingly + controller = ENV["CONTROLLER_TYPE"] + if !controller.nil? + if !parsedConfig.nil? && !parsedConfig[:prometheus_data_collection_settings].nil? + if controller.casecmp(@replicaset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? 
+ #Get prometheus replicaset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:cluster][:urls] + kubernetesServices = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_services] + monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] + + # Check for the right datattypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(kubernetesServices, String) && + checkForTypeArray(urls, String) && + !monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby + puts "config::Successfully passed typecheck for config settings for replicaset" + # Write the settings to file, so that they can be set as environment variables + file = File.open("prom_config_env_var", "w") + if !file.nil? 
+ file.write("export AZMON_RS_PROM_INTERVAL=#{interval}\n") + file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") + file.write("export AZMON_RS_PROM_FIELDPASS=\"#{fieldPass.join("\",\"")}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_RS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export AZMON_RS_PROM_FIELDDROP=#{fieldDrop.join("\",\"")}\n") + file.write("export TELEMETRY_RS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export AZMON_RS_PROM_K8S_SERVICES=#{kubernetesServices.join("\",\"")}\n") + file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") + file.write("export AZMON_RS_PROM_URLS=#{urls.join("\",\"")}\n") + file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") + file.write("export AZMON_RS_PROM_MONITOR_PODS=#{monitorKubernetesPods}\n") + file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + # Close file after writing all environment variables + file.close + puts "config::Successfully created custom config environment variable file for replicaset" + + #Also substitute these values in the test config file for telegraf + file_name = "telegraf-test-rs.conf" + text = File.read(file_name) + new_contents = text.gsub("$AZMON_RS_PROM_INTERVAL", interval) + new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDPASS", fieldPass.join("\",\"")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDDROP", fieldDrop.join("\",\"")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_URLS", urls.join("\",\"")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_K8S_SERVICES", kubernetesServices.join("\",\"")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", (monitorKubernetesPods ? 
"true" : "false")) + + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully replaced the settings in test telegraf config file for replicaset" + else + puts "config::error::Exception while opening file for writing prometheus replicaset config environment variables" + puts "****************End Prometheus Config Processing********************" + end + else + puts "config::Typecheck failed for prometheus config settings for replicaset, using defaults" + end # end of type check condition + rescue => errorStr + puts "config::error::Exception while reading config file for prometheus config for replicaset: #{errorStr}, using defaults" + puts "****************End Prometheus Config Processing********************" + end + elsif controller.casecmp(@daemonset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:node].nil? + #Get prometheus daemonset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:node][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:node][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:node][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:node][:urls] + + # Check for the right datattypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(urls, String) + puts "config::Successfully passed typecheck for config settings for daemonset" + # Write the settings to file, so that they can be set as environment variables + file = File.open("prom_config_env_var", "w") + if !file.nil? 
+ file.write("export AZMON_DS_PROM_INTERVAL=#{interval}\n") + file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") + file.write("export AZMON_DS_PROM_FIELDPASS=\"#{fieldPass.join("\",\"")}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_DS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export AZMON_DS_PROM_FIELDDROP=#{fieldDrop.join("\",\"")}\n") + file.write("export TELEMETRY_DS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export AZMON_DS_PROM_URLS=#{urls.join("\",\"")}\n") + file.write("export TELEMETRY_DS_PROM_URLS_LENGTH=#{urls.length}\n") + # Close file after writing all environment variables + file.close + puts "config::Successfully created custom config environment variable file for daemonset" + + #Also substitute these values in the test config file for telegraf + file_name = "telegraf-test.conf" + text = File.read(file_name) + new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", fieldPass.join("\",\"")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", fieldDrop.join("\",\"")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", urls.join("\",\"")) + # To write changes to the file, use: + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully replaced the settings in test telegraf config file for daemonset" + else + puts "config::error::Exception while opening file for writing prometheus daemonset config environment variables" + puts "****************End Prometheus Config Processing********************" + end + else + puts "config::Typecheck failed for prometheus config settings for daemonset, using defaults" + end # end of type check condition + rescue => errorStr + puts "config::error::Exception while reading config file for prometheus config for daemonset: #{errorStr}, using defaults" + puts "****************End Prometheus 
Config Processing********************" + end + end # end of controller type check + end + else + puts "config::error:: Controller undefined while processing prometheus config, using defaults" + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Prometheus Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@promConfigMapMountPath)) + puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults" + else + puts "config::No configmap mounted for prometheus custom config, using defaults" + end +end +puts "****************End Prometheus Config Processing********************" diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb index 3e7f48045..c72e64127 100644 --- a/installer/scripts/tomlparser.rb +++ b/installer/scripts/tomlparser.rb @@ -82,7 +82,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) if @collectStderrLogs && !stderrNamespaces.nil? if stderrNamespaces.kind_of?(Array) if !@stdoutExcludeNamespaces.nil? && !@stdoutExcludeNamespaces.empty? - stdoutNamespaces = @stdoutExcludeNamespaces.split(',') + stdoutNamespaces = @stdoutExcludeNamespaces.split(",") end # Checking only for the first element to be string because toml enforces the arrays to contain elements of same type if stderrNamespaces.length > 0 && stderrNamespaces[0].kind_of?(String) @@ -119,47 +119,47 @@ def populateSettingValuesFromConfigMap(parsedConfig) end end - @configSchemaVersion = ENV['AZMON_AGENT_CFG_SCHEMA_VERSION'] - puts "****************Start Config Processing********************" - if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? 
&& @configSchemaVersion.strip.casecmp('v1') == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = parseConfigMap - if !configMapSettings.nil? - populateSettingValuesFromConfigMap(configMapSettings) - end - else - if (File.file?(@configMapMountPath)) - puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults" - end - @excludePath = "*_kube-system_*.log" +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults" end + @excludePath = "*_kube-system_*.log" +end - # Write the settings to file, so that they can be set as environment variables - file = File.open("config_env_var", "w") +# Write the settings to file, so that they can be set as environment variables +file = File.open("config_env_var", "w") - if !file.nil? 
- # This will be used in td-agent-bit.conf file to filter out logs - if (!@collectStdoutLogs && !@collectStderrLogs) - #Stop log tailing completely - @logTailPath = "/opt/nolog*.log" - @logExclusionRegexPattern = "stdout|stderr" - elsif !@collectStdoutLogs - @logExclusionRegexPattern = "stdout" - elsif !@collectStderrLogs - @logExclusionRegexPattern = "stderr" - end - file.write("export AZMON_COLLECT_STDOUT_LOGS=#{@collectStdoutLogs}\n") - file.write("export AZMON_LOG_TAIL_PATH=#{@logTailPath}\n") - file.write("export AZMON_LOG_EXCLUSION_REGEX_PATTERN=\"#{@logExclusionRegexPattern}\"\n") - file.write("export AZMON_STDOUT_EXCLUDED_NAMESPACES=#{@stdoutExcludeNamespaces}\n") - file.write("export AZMON_COLLECT_STDERR_LOGS=#{@collectStderrLogs}\n") - file.write("export AZMON_STDERR_EXCLUDED_NAMESPACES=#{@stderrExcludeNamespaces}\n") - file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n") - file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") - # Close file after writing all environment variables - file.close - puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " - puts "****************End Config Processing********************" - else - puts "config::error::Exception while opening file for writing config environment variables" - puts "****************End Config Processing********************" +if !file.nil? 
+ # This will be used in td-agent-bit.conf file to filter out logs + if (!@collectStdoutLogs && !@collectStderrLogs) + #Stop log tailing completely + @logTailPath = "/opt/nolog*.log" + @logExclusionRegexPattern = "stdout|stderr" + elsif !@collectStdoutLogs + @logExclusionRegexPattern = "stdout" + elsif !@collectStderrLogs + @logExclusionRegexPattern = "stderr" end + file.write("export AZMON_COLLECT_STDOUT_LOGS=#{@collectStdoutLogs}\n") + file.write("export AZMON_LOG_TAIL_PATH=#{@logTailPath}\n") + file.write("export AZMON_LOG_EXCLUSION_REGEX_PATTERN=\"#{@logExclusionRegexPattern}\"\n") + file.write("export AZMON_STDOUT_EXCLUDED_NAMESPACES=#{@stdoutExcludeNamespaces}\n") + file.write("export AZMON_COLLECT_STDERR_LOGS=#{@collectStderrLogs}\n") + file.write("export AZMON_STDERR_EXCLUDED_NAMESPACES=#{@stderrExcludeNamespaces}\n") + file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n") + file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") + # Close file after writing all environment variables + file.close + puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " + puts "****************End Config Processing********************" +else + puts "config::error::Exception while opening file for writing config environment variables" + puts "****************End Config Processing********************" +end From d12c7df322f6b62263475e9df38e8a1e61429ffb Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 9 Jul 2019 11:52:30 -0700 Subject: [PATCH 09/12] Rashmi/promcustomconfig (#236) * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * fix exceptions * changes to remove some exceptions * exception fixes --- installer/conf/td-agent-bit-rs.conf | 11 - installer/conf/td-agent-bit.conf | 22 +- installer/conf/telegraf-rs.conf | 69 ++-- 
installer/conf/telegraf-test-rs.conf | 113 ------ installer/conf/telegraf-test.conf | 100 ------ installer/conf/telegraf.conf | 67 ++-- installer/datafiles/base_container.data | 2 - .../scripts/tomlparser-prom-customconfig.rb | 116 ++++--- source/code/go/src/plugins/oms.go | 20 +- source/code/go/src/plugins/out_oms.go | 2 - source/code/plugin/DockerApiClient.rb | 325 +++++++++--------- source/code/plugin/KubernetesApiClient.rb | 2 +- source/code/plugin/in_containerinventory.rb | 5 +- source/code/plugin/in_kube_events.rb | 138 ++++---- source/code/plugin/in_kube_nodes.rb | 134 ++++---- source/code/plugin/in_kube_podinventory.rb | 14 +- source/code/plugin/in_kube_services.rb | 191 +++++----- 17 files changed, 562 insertions(+), 769 deletions(-) delete mode 100644 installer/conf/telegraf-test-rs.conf delete mode 100644 installer/conf/telegraf-test.conf diff --git a/installer/conf/td-agent-bit-rs.conf b/installer/conf/td-agent-bit-rs.conf index 0e7218a2b..7839b0eee 100644 --- a/installer/conf/td-agent-bit-rs.conf +++ b/installer/conf/td-agent-bit-rs.conf @@ -4,17 +4,6 @@ Parsers_File /etc/td-agent-bit/parsers.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log -[INPUT] - Name tail - Tag oms.container.log.telegraf.err.* - Path /var/opt/microsoft/docker-cimprov/log/telegraf.log - DB /var/opt/microsoft/docker-cimprov/state/telegraf-rs-log-state.db - DB.Sync Off - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - [INPUT] Name tcp Tag oms.container.perf.telegraf.* diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 2dee26234..e7aabd242 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -6,7 +6,7 @@ [INPUT] Name tail - Tag oms.container.log.* + Tag oms.container.log.la.* Path ${AZMON_LOG_TAIL_PATH} DB /var/log/omsagent-fblogs.db DB.Sync Off @@ -32,17 +32,6 @@ Skip_Long_Lines On Ignore_Older 2m -[INPUT] - Name tail - Tag oms.container.log.telegraf.err.* - Path 
/var/opt/microsoft/docker-cimprov/log/telegraf.log - DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db - DB.Sync Off - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - [INPUT] Name tcp Tag oms.container.perf.telegraf.* @@ -53,9 +42,16 @@ [FILTER] Name grep - Match oms.container.log.* + Match oms.container.log.la.* Exclude stream ${AZMON_LOG_EXCLUSION_REGEX_PATTERN} +# Exclude prometheus plugin exceptions that might be caused due to invalid config.(Logs which contain - E! [inputs.prometheus]) +# Excluding these logs from being sent to AI since it can result in high volume of data in telemetry due to invalid config. +[FILTER] + Name grep + Match oms.container.log.flbplugin.* + Exclude log E! [\[]inputs.prometheus[\]] + [OUTPUT] Name oms EnableTelemetry true diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index 8e8665104..53aa03620 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -75,9 +75,9 @@ ## Run telegraf with debug log messages. debug = false ## Run telegraf in quiet mode (error log messages only). - quiet = false + quiet = true ## Specify the log file name. The empty string means to log to stderr. - logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" + logfile = "" ## Override default hostname, if empty use os.Hostname() #hostname = "placeholder_hostname" @@ -544,14 +544,13 @@ #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] interval = "$AZMON_RS_PROM_INTERVAL" ## An array of urls to scrape metrics from. 
- urls = ["$AZMON_RS_PROM_URLS"] + urls = $AZMON_RS_PROM_URLS - kubernetes_services = ["$AZMON_RS_PROM_K8S_SERVICES"] + kubernetes_services = $AZMON_RS_PROM_K8S_SERVICES monitor_kubernetes_pods = $AZMON_RS_PROM_MONITOR_PODS - - fieldpass = ["$AZMON_RS_PROM_FIELDPASS"] - fielddrop = ["$AZMON_RS_PROM_FIELDDROP"] + fieldpass = $AZMON_RS_PROM_FIELDPASS + fielddrop = $AZMON_RS_PROM_FIELDDROP metric_version = 2 url_tag = "scrapeUrl" @@ -586,32 +585,32 @@ insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] -[[inputs.exec]] - ## Commands array - interval = "15m" - commands = [ - "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" - ] - - ## Timeout for each command to complete. - timeout = "15s" - - ## measurement name suffix (for separating different commands) - name_suffix = "_telemetry" - - ## Data format to consume. - ## Each data format has its own unique set of configuration options, read - ## more about them here: - ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md - data_format = "influx" - #tagexclude = ["hostName"] - [inputs.exec.tags] - AgentVersion = "$AGENT_VERSION" - AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" - ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" - Region = "$TELEMETRY_AKS_REGION" - ClusterName = "$TELEMETRY_CLUSTER_NAME" - ClusterType = "$TELEMETRY_CLUSTER_TYPE" - Computer = "placeholder_hostname" - ControllerType = "$CONTROLLER_TYPE" +# [[inputs.exec]] +# ## Commands array +# interval = "15m" +# commands = [ +# "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" +# ] + +# ## Timeout for each command to complete. +# timeout = "15s" + +# ## measurement name suffix (for separating different commands) +# name_suffix = "_telemetry" + +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# #tagexclude = ["hostName"] +# [inputs.exec.tags] +# AgentVersion = "$AGENT_VERSION" +# AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" +# ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" +# Region = "$TELEMETRY_AKS_REGION" +# ClusterName = "$TELEMETRY_CLUSTER_NAME" +# ClusterType = "$TELEMETRY_CLUSTER_TYPE" +# Computer = "placeholder_hostname" +# ControllerType = "$CONTROLLER_TYPE" diff --git a/installer/conf/telegraf-test-rs.conf b/installer/conf/telegraf-test-rs.conf deleted file mode 100644 index 4ece2bf8c..000000000 --- a/installer/conf/telegraf-test-rs.conf +++ /dev/null @@ -1,113 +0,0 @@ -# Telegraf Configuration -# -# Telegraf is entirely plugin driven. All metrics are gathered from the -# declared inputs, and sent to the declared outputs. -# -# Plugins must be declared in here to be active. -# To deactivate a plugin, comment out the name and any variables. -# -# Use 'telegraf -config telegraf.conf -test' to see what metrics a config -# file would generate. -# -# Environment variables can be used anywhere in this config file, simply prepend -# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), -# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) - -# Configuration for telegraf agent -[agent] - ## Default data collection interval for all inputs - interval = "60s" - ## Rounds collection interval to 'interval' - ## ie, if interval="10s" then always collect on :00, :10, :20, etc. - round_interval = true - - ## Telegraf will send metrics to outputs in batches of at most - ## metric_batch_size metrics. - ## This controls the size of writes that Telegraf sends to output plugins. 
- metric_batch_size = 1000 - - ## For failed writes, telegraf will cache metric_buffer_limit metrics for each - ## output, and will flush this buffer on a successful write. Oldest metrics - ## are dropped first when this buffer fills. - ## This buffer only fills when writes fail to output plugin(s). - metric_buffer_limit = 10000 - - ## Collection jitter is used to jitter the collection by a random amount. - ## Each plugin will sleep for a random time within jitter before collecting. - ## This can be used to avoid many plugins querying things like sysfs at the - ## same time, which can have a measurable effect on the system. - collection_jitter = "0s" - - ## Default flushing interval for all outputs. You shouldn't set this below - ## interval. Maximum flush_interval will be flush_interval + flush_jitter - flush_interval = "60s" - ## Jitter the flush interval by a random amount. This is primarily to avoid - ## large write spikes for users running a large number of telegraf instances. - ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s - flush_jitter = "0s" - - ## By default or when set to "0s", precision will be set to the same - ## timestamp order as the collection interval, with the maximum being 1s. - ## ie, when interval = "10s", precision will be "1s" - ## when interval = "250ms", precision will be "1ms" - ## Precision will NOT be used for service inputs. It is up to each individual - ## service input to set the timestamp at the appropriate precision. - ## Valid time units are "ns", "us" (or "µs"), "ms", "s". - precision = "" - - ## Logging configuration: - ## Run telegraf with debug log messages. - debug = false - ## Run telegraf in quiet mode (error log messages only). - quiet = false - ## Specify the log file name. The empty string means to log to stderr. 
- logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" - - ## Override default hostname, if empty use os.Hostname() - #hostname = "placeholder_hostname" - ## If set to true, do no set the "host" tag in the telegraf agent. - omit_hostname = true - - -############################################################################### -# INPUT PLUGINS # -############################################################################### - -#Prometheus Custom Metrics -[[inputs.prometheus]] - ## An array of urls to scrape metrics from. - interval = "$AZMON_RS_PROM_INTERVAL" - - ## An array of urls to scrape metrics from. - #urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "http://$NODE_IP:10254/metrics", "http://$NODE_IP:9100/metrics"] - urls = ["$AZMON_RS_PROM_URLS"] - - #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] - fieldpass = ["$AZMON_RS_PROM_FIELDPASS"] - fielddrop = ["$AZMON_RS_PROM_FIELDDROP"] - - ## An array of Kubernetes services to scrape metrics from. - #kubernetes_services = ["https://kube-state-metrics.monitoring:8443/metrics","https://kube-state-metrics.monitoring:9443/metrics","http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics"] - kubernetes_services = ["$AZMON_RS_PROM_K8S_SERVICES"] - - ## Scrape Kubernetes pods for the following prometheus annotations: - ## - prometheus.io/scrape: Enable scraping for this pod - ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to - ## set this to `https` & most likely set the tls config. - ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. - ## - prometheus.io/port: If port is not 9102 use this annotation - monitor_kubernetes_pods = $AZMON_RS_PROM_MONITOR_PODS - - metric_version = 2 - url_tag = "scrapeUrl" - - ## Use bearer token for authorization. 
('bearer_token' takes priority) - bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" - - ## Specify timeout duration for slower prometheus clients (default is 3s) - response_timeout = "15s" - - ## Optional TLS Config - tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - ## Use TLS but skip chain & host verification - insecure_skip_verify = true diff --git a/installer/conf/telegraf-test.conf b/installer/conf/telegraf-test.conf deleted file mode 100644 index f1a7880ad..000000000 --- a/installer/conf/telegraf-test.conf +++ /dev/null @@ -1,100 +0,0 @@ -# Telegraf Configuration -# -# Telegraf is entirely plugin driven. All metrics are gathered from the -# declared inputs, and sent to the declared outputs. -# -# Plugins must be declared in here to be active. -# To deactivate a plugin, comment out the name and any variables. -# -# Use 'telegraf -config telegraf.conf -test' to see what metrics a config -# file would generate. -# -# Environment variables can be used anywhere in this config file, simply prepend -# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), -# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) - -# Configuration for telegraf agent -[agent] - ## Default data collection interval for all inputs - interval = "60s" - ## Rounds collection interval to 'interval' - ## ie, if interval="10s" then always collect on :00, :10, :20, etc. - round_interval = true - - ## Telegraf will send metrics to outputs in batches of at most - ## metric_batch_size metrics. - ## This controls the size of writes that Telegraf sends to output plugins. - metric_batch_size = 1000 - - ## For failed writes, telegraf will cache metric_buffer_limit metrics for each - ## output, and will flush this buffer on a successful write. Oldest metrics - ## are dropped first when this buffer fills. - ## This buffer only fills when writes fail to output plugin(s). 
- metric_buffer_limit = 10000 - - ## Collection jitter is used to jitter the collection by a random amount. - ## Each plugin will sleep for a random time within jitter before collecting. - ## This can be used to avoid many plugins querying things like sysfs at the - ## same time, which can have a measurable effect on the system. - collection_jitter = "0s" - - ## Default flushing interval for all outputs. You shouldn't set this below - ## interval. Maximum flush_interval will be flush_interval + flush_jitter - flush_interval = "60s" - ## Jitter the flush interval by a random amount. This is primarily to avoid - ## large write spikes for users running a large number of telegraf instances. - ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s - flush_jitter = "0s" - - ## By default or when set to "0s", precision will be set to the same - ## timestamp order as the collection interval, with the maximum being 1s. - ## ie, when interval = "10s", precision will be "1s" - ## when interval = "250ms", precision will be "1ms" - ## Precision will NOT be used for service inputs. It is up to each individual - ## service input to set the timestamp at the appropriate precision. - ## Valid time units are "ns", "us" (or "µs"), "ms", "s". - precision = "" - - ## Logging configuration: - ## Run telegraf with debug log messages. - debug = false - ## Run telegraf in quiet mode (error log messages only). - quiet = false - ## Specify the log file name. The empty string means to log to stderr. - logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" - - ## Override default hostname, if empty use os.Hostname() - #hostname = "placeholder_hostname" - ## If set to true, do no set the "host" tag in the telegraf agent. 
- omit_hostname = true - - -############################################################################### -# INPUT PLUGINS # -############################################################################### - -#Prometheus Custom Metrics -[[inputs.prometheus]] - ## An array of urls to scrape metrics from. - interval = "$AZMON_DS_PROM_INTERVAL" - - ## An array of urls to scrape metrics from. - #urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "http://$NODE_IP:10254/metrics", "http://$NODE_IP:9100/metrics"] - urls = ["$AZMON_DS_PROM_URLS"] - - fieldpass = ["$AZMON_DS_PROM_FIELDPASS"] - fielddrop = ["$AZMON_DS_PROM_FIELDDROP"] - - metric_version = 2 - url_tag = "scrapeUrl" - - ## Use bearer token for authorization. ('bearer_token' takes priority) - bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" - - ## Specify timeout duration for slower prometheus clients (default is 3s) - response_timeout = "15s" - - ## Optional TLS Config - tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - ## Use TLS but skip chain & host verification - insecure_skip_verify = true diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index a83db55cf..47e71c5f5 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -75,10 +75,9 @@ ## Run telegraf with debug log messages. debug = false ## Run telegraf in quiet mode (error log messages only). - quiet = false + quiet = true ## Specify the log file name. The empty string means to log to stderr. - logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" - + logfile = "" ## Override default hostname, if empty use os.Hostname() #hostname = "placeholder_hostname" ## If set to true, do no set the "host" tag in the telegraf agent. @@ -575,11 +574,11 @@ interval = "$AZMON_DS_PROM_INTERVAL" ## An array of urls to scrape metrics from. 
- urls = ["$AZMON_DS_PROM_URLS"] + urls = $AZMON_DS_PROM_URLS - fieldpass = ["$AZMON_DS_PROM_FIELDPASS"] + fieldpass = $AZMON_DS_PROM_FIELDPASS - fielddrop = ["$AZMON_DS_PROM_FIELDDROP"] + fielddrop = $AZMON_DS_PROM_FIELDDROP metric_version = 2 url_tag = "scrapeUrl" @@ -614,31 +613,31 @@ insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] -[[inputs.exec]] - ## Commands array - interval = "15m" - commands = [ - "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" - ] - - ## Timeout for each command to complete. - timeout = "15s" - - ## measurement name suffix (for separating different commands) - name_suffix = "_telemetry" - - ## Data format to consume. - ## Each data format has its own unique set of configuration options, read - ## more about them here: - ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md - data_format = "influx" - tagexclude = ["hostName"] - [inputs.exec.tags] - AgentVersion = "$AGENT_VERSION" - AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" - ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" - Region = "$TELEMETRY_AKS_REGION" - ClusterName = "$TELEMETRY_CLUSTER_NAME" - ClusterType = "$TELEMETRY_CLUSTER_TYPE" - Computer = "placeholder_hostname" - ControllerType = "$CONTROLLER_TYPE" \ No newline at end of file +# [[inputs.exec]] +# ## Commands array +# interval = "15m" +# commands = [ +# "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" +# ] + +# ## Timeout for each command to complete. +# timeout = "15s" + +# ## measurement name suffix (for separating different commands) +# name_suffix = "_telemetry" + +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# tagexclude = ["hostName"] +# [inputs.exec.tags] +# AgentVersion = "$AGENT_VERSION" +# AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" +# ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" +# Region = "$TELEMETRY_AKS_REGION" +# ClusterName = "$TELEMETRY_CLUSTER_NAME" +# ClusterType = "$TELEMETRY_CLUSTER_TYPE" +# Computer = "placeholder_hostname" +# ControllerType = "$CONTROLLER_TYPE" \ No newline at end of file diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 5a18805be..fe1635335 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -110,8 +110,6 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root -/opt/telegraf-test.conf; installer/conf/telegraf-test.conf; 644; root; root -/opt/telegraf-test-rs.conf; installer/conf/telegraf-test-rs.conf; 644; root; root /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root /opt/livenessprobe.sh; installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root diff --git a/installer/scripts/tomlparser-prom-customconfig.rb b/installer/scripts/tomlparser-prom-customconfig.rb index 5df83c89a..d9fdf1cc2 100644 --- a/installer/scripts/tomlparser-prom-customconfig.rb +++ b/installer/scripts/tomlparser-prom-customconfig.rb @@ -1,11 +1,22 @@ #!/usr/local/bin/ruby require_relative "tomlrb" +require "fileutils" 
@promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" @replicaset = "replicaset" @daemonset = "daemonset" @configSchemaVersion = "" +@defaultDsInterval = "1m" +@defaultDsPromUrls = [] +@defaultDsFieldPass = [] +@defaultDsFieldDrop = [] +@defaultRsInterval = "1m" +@defaultRsPromUrls = [] +@defaultRsFieldPass = [] +@defaultRsFieldDrop = [] +@defaultRsK8sServices = [] +@defaultRsMonitorPods = false # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -27,7 +38,7 @@ def parseConfigMap end def checkForTypeArray(arrayValue, arrayType) - if !arrayValue.nil? && arrayValue.kind_of?(Array) && arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType) + if (arrayValue.nil? || (arrayValue.kind_of?(Array) && arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))) return true else return false @@ -35,7 +46,7 @@ def checkForTypeArray(arrayValue, arrayType) end def checkForType(variable, varType) - if !variable.nil? && variable.kind_of?(varType) + if variable.nil? || variable.kind_of?(varType) return true else return false @@ -66,47 +77,49 @@ def populateSettingValuesFromConfigMap(parsedConfig) checkForTypeArray(urls, String) && !monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby puts "config::Successfully passed typecheck for config settings for replicaset" - # Write the settings to file, so that they can be set as environment variables - file = File.open("prom_config_env_var", "w") + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultRsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultRsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultRsFieldDrop : fieldDrop + kubernetesServices = (kubernetesServices.nil?) ? @defaultRsK8sServices : kubernetesServices + urls = (urls.nil?) ? @defaultRsPromUrls : urls + monitorKubernetesPods = (kubernetesServices.nil?) ? 
@defaultRsMonitorPods : monitorKubernetesPods + + file_name = "/opt/telegraf-test-rs.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for replicaset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_RS_PROM_INTERVAL", interval) + new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_K8S_SERVICES", ((kubernetesServices.length > 0) ? ("[\"" + kubernetesServices.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", (monitorKubernetesPods ? "true" : "false")) + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") if !file.nil? 
- file.write("export AZMON_RS_PROM_INTERVAL=#{interval}\n") file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") - file.write("export AZMON_RS_PROM_FIELDPASS=\"#{fieldPass.join("\",\"")}\"\n") #Setting array lengths as environment variables for telemetry purposes file.write("export TELEMETRY_RS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") - file.write("export AZMON_RS_PROM_FIELDDROP=#{fieldDrop.join("\",\"")}\n") file.write("export TELEMETRY_RS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") - file.write("export AZMON_RS_PROM_K8S_SERVICES=#{kubernetesServices.join("\",\"")}\n") file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") - file.write("export AZMON_RS_PROM_URLS=#{urls.join("\",\"")}\n") file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") - file.write("export AZMON_RS_PROM_MONITOR_PODS=#{monitorKubernetesPods}\n") file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") # Close file after writing all environment variables file.close - puts "config::Successfully created custom config environment variable file for replicaset" - - #Also substitute these values in the test config file for telegraf - file_name = "telegraf-test-rs.conf" - text = File.read(file_name) - new_contents = text.gsub("$AZMON_RS_PROM_INTERVAL", interval) - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDPASS", fieldPass.join("\",\"")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDDROP", fieldDrop.join("\",\"")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_URLS", urls.join("\",\"")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_K8S_SERVICES", kubernetesServices.join("\",\"")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", (monitorKubernetesPods ? 
"true" : "false")) - - File.open(file_name, "w") { |file| file.puts new_contents } - puts "config::Successfully replaced the settings in test telegraf config file for replicaset" - else - puts "config::error::Exception while opening file for writing prometheus replicaset config environment variables" - puts "****************End Prometheus Config Processing********************" + puts "config::Successfully created telemetry file for replicaset" end else puts "config::Typecheck failed for prometheus config settings for replicaset, using defaults" end # end of type check condition rescue => errorStr - puts "config::error::Exception while reading config file for prometheus config for replicaset: #{errorStr}, using defaults" + puts "config::error::Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults" + setRsPromDefaults puts "****************End Prometheus Config Processing********************" end elsif controller.casecmp(@daemonset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:node].nil? @@ -123,41 +136,44 @@ def populateSettingValuesFromConfigMap(parsedConfig) checkForTypeArray(fieldDrop, String) && checkForTypeArray(urls, String) puts "config::Successfully passed typecheck for config settings for daemonset" - # Write the settings to file, so that they can be set as environment variables - file = File.open("prom_config_env_var", "w") + + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultDsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultDsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultDsFieldDrop : fieldDrop + urls = (urls.nil?) ? 
@defaultDsPromUrls : urls + + file_name = "/opt/telegraf-test.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for daemonset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" + + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") if !file.nil? 
- file.write("export AZMON_DS_PROM_INTERVAL=#{interval}\n") file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") - file.write("export AZMON_DS_PROM_FIELDPASS=\"#{fieldPass.join("\",\"")}\"\n") #Setting array lengths as environment variables for telemetry purposes file.write("export TELEMETRY_DS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") - file.write("export AZMON_DS_PROM_FIELDDROP=#{fieldDrop.join("\",\"")}\n") file.write("export TELEMETRY_DS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") - file.write("export AZMON_DS_PROM_URLS=#{urls.join("\",\"")}\n") file.write("export TELEMETRY_DS_PROM_URLS_LENGTH=#{urls.length}\n") # Close file after writing all environment variables file.close - puts "config::Successfully created custom config environment variable file for daemonset" - - #Also substitute these values in the test config file for telegraf - file_name = "telegraf-test.conf" - text = File.read(file_name) - new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) - new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", fieldPass.join("\",\"")) - new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", fieldDrop.join("\",\"")) - new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", urls.join("\",\"")) - # To write changes to the file, use: - File.open(file_name, "w") { |file| file.puts new_contents } - puts "config::Successfully replaced the settings in test telegraf config file for daemonset" - else - puts "config::error::Exception while opening file for writing prometheus daemonset config environment variables" - puts "****************End Prometheus Config Processing********************" + puts "config::Successfully created telemetry file for daemonset" end else puts "config::Typecheck failed for prometheus config settings for daemonset, using defaults" end # end of type check condition rescue => errorStr - puts "config::error::Exception while reading config file for prometheus config for daemonset: #{errorStr}, using 
defaults" + puts "config::error::Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults" puts "****************End Prometheus Config Processing********************" end end # end of controller type check diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 301aff1ed..319ff3551 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -191,7 +191,6 @@ func updateContainerImageNameMaps() { if err != nil { message := fmt.Sprintf("Error getting pods %s\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) Log(message) - SendException(message) continue } @@ -224,7 +223,7 @@ func populateExcludedStdoutNamespaces() { if (strings.Compare(collectStdoutLogs, "true") == 0) && (len(excludeList) > 0) { stdoutNSExcludeList = strings.Split(excludeList, ",") for _, ns := range stdoutNSExcludeList { - Log ("Excluding namespace %s for stdout log collection", ns) + Log("Excluding namespace %s for stdout log collection", ns) StdoutIgnoreNsSet[strings.TrimSpace(ns)] = true } } @@ -237,7 +236,7 @@ func populateExcludedStderrNamespaces() { if (strings.Compare(collectStderrLogs, "true") == 0) && (len(excludeList) > 0) { stderrNSExcludeList = strings.Split(excludeList, ",") for _, ns := range stderrNSExcludeList { - Log ("Excluding namespace %s for stderr log collection", ns) + Log("Excluding namespace %s for stderr log collection", ns) StderrIgnoreNsSet[strings.TrimSpace(ns)] = true } } @@ -382,7 +381,6 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int if err != nil { message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. 
duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) Log(message) - SendException(message) UpdateNumTelegrafMetricsSentTelemetry(0, 1) return output.FLB_RETRY } @@ -423,7 +421,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { nameIDMap := make(map[string]string) DataUpdateMutex.Lock() - + for k, v := range ImageIDMap { imageIDMap[k] = v } @@ -515,7 +513,8 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if err != nil { message := fmt.Sprintf("Error when sending request %s \n", err.Error()) Log(message) - SendException(message) + // Commenting this out for now. TODO - Add better telemetry for ods errors using aggregation + //SendException(message) Log("Failed to flush %d records after %s", len(dataItems), elapsed) return output.FLB_RETRY @@ -559,7 +558,7 @@ func GetContainerIDK8sNamespaceFromFileName(filename string) (string, string) { start := strings.LastIndex(filename, "-") end := strings.LastIndex(filename, ".") - + if start >= end || start == -1 || end == -1 { id = "" } else { @@ -639,7 +638,6 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("containerInventoryRefreshInterval = %d \n", containerInventoryRefreshInterval) ContainerImageNameRefreshTicker = time.NewTicker(time.Second * time.Duration(containerInventoryRefreshInterval)) - // Populate Computer field containerHostName, err := ioutil.ReadFile(pluginConfig["container_host_file_path"]) if err != nil { @@ -678,11 +676,11 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { CreateHTTPClient() - if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { populateExcludedStdoutNamespaces() populateExcludedStderrNamespaces() - go updateContainerImageNameMaps() - } else { + go updateContainerImageNameMaps() + } else { Log("Running in replicaset. 
Disabling container enrichment caching & updates \n") } } diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 0fa2ddd4b..e9e7124b7 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -64,8 +64,6 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { return PushToAppInsightsTraces(records, appinsights.Information, incomingTag) } else if strings.Contains(incomingTag, "oms.container.perf.telegraf") { return PostTelegrafMetricsToLA(records) - } else if strings.Contains(incomingTag, "oms.container.log.telegraf.err") { - return PushToAppInsightsTraces(records, appinsights.Error, incomingTag) } return PostDataHelper(records) diff --git a/source/code/plugin/DockerApiClient.rb b/source/code/plugin/DockerApiClient.rb index 5a46b5fdb..eb9d74531 100644 --- a/source/code/plugin/DockerApiClient.rb +++ b/source/code/plugin/DockerApiClient.rb @@ -2,179 +2,196 @@ # frozen_string_literal: true class DockerApiClient + require "socket" + require "json" + require "timeout" + require_relative "omslog" + require_relative "DockerApiRestHelper" + require_relative "ApplicationInsightsUtility" - require 'socket' - require 'json' - require 'timeout' - require_relative 'omslog' - require_relative 'DockerApiRestHelper' - require_relative 'ApplicationInsightsUtility' + @@SocketPath = "/var/run/host/docker.sock" + @@ChunkSize = 4096 + @@TimeoutInSeconds = 5 + @@PluginName = "ContainerInventory" - @@SocketPath = "/var/run/host/docker.sock" - @@ChunkSize = 4096 - @@TimeoutInSeconds = 5 - @@PluginName = 'ContainerInventory' + def initialize + end - def initialize - end - - class << self - # Make docker socket call for requests - def getResponse(request, isMultiJson, isVersion) - begin - socket = UNIXSocket.new(@@SocketPath) - dockerResponse = "" - isTimeOut = false - socket.write(request) - # iterate through the response until the last chunk is less than the chunk size so that we can read 
all data in socket. - loop do - begin - responseChunk = "" - timeout(@@TimeoutInSeconds) do - responseChunk = socket.recv(@@ChunkSize) - end - dockerResponse += responseChunk - rescue Timeout::Error - $log.warn("Socket read timedout for request: #{request} @ #{Time.now.utc.iso8601}") - isTimeOut = true - break - end - break if (isVersion)? (responseChunk.length < @@ChunkSize) : (responseChunk.end_with? "0\r\n\r\n") - end - socket.close - return (isTimeOut)? nil : parseResponse(dockerResponse, isMultiJson) - rescue => errorStr - $log.warn("Socket call failed for request: #{request} error: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + class << self + # Make docker socket call for requests + def getResponse(request, isMultiJson, isVersion) + begin + socket = UNIXSocket.new(@@SocketPath) + dockerResponse = "" + isTimeOut = false + socket.write(request) + # iterate through the response until the last chunk is less than the chunk size so that we can read all data in socket. + loop do + begin + responseChunk = "" + timeout(@@TimeoutInSeconds) do + responseChunk = socket.recv(@@ChunkSize) end + dockerResponse += responseChunk + rescue Timeout::Error + $log.warn("Socket read timedout for request: #{request} @ #{Time.now.utc.iso8601}") + isTimeOut = true + break + end + break if (isVersion) ? (responseChunk.length < @@ChunkSize) : (responseChunk.end_with? "0\r\n\r\n") end + socket.close + return (isTimeOut) ? nil : parseResponse(dockerResponse, isMultiJson) + rescue => errorStr + $log.warn("Socket call failed for request: #{request} error: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end - def parseResponse(dockerResponse, isMultiJson) - # Doing this because the response is in the raw format and includes headers. 
- # Need to do a regex match to extract the json part of the response - Anything between [{}] in response - parsedJsonResponse = nil - begin - jsonResponse = isMultiJson ? dockerResponse[/\[{.+}\]/] : dockerResponse[/{.+}/] - rescue => errorStr - $log.warn("Regex match for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") - end - begin - if jsonResponse != nil - parsedJsonResponse = JSON.parse(jsonResponse) - end - rescue => errorStr - $log.warn("Json parsing for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - return parsedJsonResponse - end + def parseResponse(dockerResponse, isMultiJson) + # Doing this because the response is in the raw format and includes headers. + # Need to do a regex match to extract the json part of the response - Anything between [{}] in response + parsedJsonResponse = nil + begin + jsonResponse = isMultiJson ? 
dockerResponse[/\[{.+}\]/] : dockerResponse[/{.+}/] + rescue => errorStr + $log.warn("Regex match for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") + end + begin + if jsonResponse != nil + parsedJsonResponse = JSON.parse(jsonResponse) + end + rescue => errorStr + $log.warn("Json parsing for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + return parsedJsonResponse + end + def getDockerHostName() + dockerHostName = "" + request = DockerApiRestHelper.restDockerInfo + response = getResponse(request, false, false) + if (response != nil) + dockerHostName = response["Name"] + end + return dockerHostName + end - def getDockerHostName() - dockerHostName = "" - request = DockerApiRestHelper.restDockerInfo - response = getResponse(request, false, false) - if (response != nil) - dockerHostName = response['Name'] + def listContainers() + ids = [] + request = DockerApiRestHelper.restDockerPs + containers = getResponse(request, true, false) + if !containers.nil? && !containers.empty? + containers.each do |container| + labels = (!container["Labels"].nil?) ? container["Labels"] : container["labels"] + if !labels.nil? + labelKeys = labels.keys + dockerTypeLabel = labelKeys.find { |k| "io.kubernetes.docker.type".downcase == k.downcase } + if !dockerTypeLabel.nil? + dockerTypeLabelValue = labels[dockerTypeLabel] + # Checking for 'io.kubernetes.docker.type' label for docker containers to exclude the pause-amd64 containers + if !(dockerTypeLabelValue.downcase == "podsandbox".downcase) + # Case insensitive lookup for pod uid label - This is to exclude containers created using docker run and only include containers that + # are created in the pods for ContainerInventory + keyValue = labelKeys.find { |k| "io.kubernetes.pod.uid".downcase == k.downcase } + if !labels[keyValue].nil? 
+ ids.push(container["Id"]) + end + end end - return dockerHostName + end end + end + return ids + end - def listContainers() - ids = [] - request = DockerApiRestHelper.restDockerPs - containers = getResponse(request, true, false) - if !containers.nil? && !containers.empty? - containers.each do |container| - labels = (!container['Labels'].nil?)? container['Labels'] : container['labels'] - if !labels.nil? - labelKeys = labels.keys - dockerTypeLabel = labelKeys.find {|k| 'io.kubernetes.docker.type'.downcase == k.downcase} - if !dockerTypeLabel.nil? - dockerTypeLabelValue = labels[dockerTypeLabel] - # Checking for 'io.kubernetes.docker.type' label for docker containers to exclude the pause-amd64 containers - if !(dockerTypeLabelValue.downcase == "podsandbox".downcase) - # Case insensitive lookup for pod uid label - This is to exclude containers created using docker run and only include containers that - # are created in the pods for ContainerInventory - keyValue = labelKeys.find {|k| 'io.kubernetes.pod.uid'.downcase == k.downcase} - if !labels[keyValue].nil? - ids.push(container['Id']) - end - end - end - end - end - end - return ids + # This method splits the tag value into an array - repository, image, tag, repodigest-imageid + def getImageRepositoryImageTag(tagValue, digestValue) + result = ["", "", "", ""] + atLocation = nil + begin + if !digestValue.empty? + # digest is of the format - repo@sha256:imageid + atLocation = digestValue.index("@") + if !atLocation.nil? + result[3] = digestValue[(atLocation + 1)..-1] + end end - # This method splits the tag value into an array - repository, image and tag - def getImageRepositoryImageTag(tagValue) - result = ["", "", ""] - begin - if !tagValue.empty? - # Find delimiters in the string of format repository/image:imagetag - slashLocation = tagValue.index('/') - colonLocation = tagValue.index(':') - if !colonLocation.nil? - if slashLocation.nil? 
- # image:imagetag - result[1] = tagValue[0..(colonLocation-1)] - else - # repository/image:imagetag - result[0] = tagValue[0..(slashLocation-1)] - result[1] = tagValue[(slashLocation + 1)..(colonLocation - 1)] - end - result[2] = tagValue[(colonLocation + 1)..-1] - end - end - rescue => errorStr - $log.warn("Exception at getImageRepositoryImageTag: #{errorStr} @ #{Time.now.utc.iso8601}") + if !tagValue.empty? + # Find delimiters in the string of format repository/image:imagetag + slashLocation = tagValue.index("/") + colonLocation = tagValue.index(":") + if !colonLocation.nil? + if slashLocation.nil? + # image:imagetag + result[1] = tagValue[0..(colonLocation - 1)] + else + # repository/image:imagetag + result[0] = tagValue[0..(slashLocation - 1)] + result[1] = tagValue[(slashLocation + 1)..(colonLocation - 1)] end - return result + result[2] = tagValue[(colonLocation + 1)..-1] + end + elsif !digestValue.empty? + # Getting repo information from repodigests when repotags is empty + if !atLocation.nil? + result[0] = digestValue[0..(atLocation - 1)] + end end + rescue => errorStr + $log.warn("Exception at getImageRepositoryImageTag: #{errorStr} @ #{Time.now.utc.iso8601}") + end + return result + end - # Image is in the format repository/image:imagetag - This method creates a hash of image id and repository, image and tag - def getImageIdMap() - result = nil - begin - request = DockerApiRestHelper.restDockerImages - images = getResponse(request, true, false) - if !images.nil? && !images.empty? - result = {} - images.each do |image| - tagValue = "" - tags = image['RepoTags'] - if !tags.nil? && tags.kind_of?(Array) && tags.length > 0 - tagValue = tags[0] - end - idValue = image['Id'] - if !idValue.nil? 
- result[idValue] = getImageRepositoryImageTag(tagValue) - end - end - end - rescue => errorStr - $log.warn("Exception at getImageIdMap: #{errorStr} @ #{Time.now.utc.iso8601}") + # Image is in the format repository/image:imagetag - This method creates a hash of image id and repository, image and tag + def getImageIdMap() + result = nil + begin + request = DockerApiRestHelper.restDockerImages + images = getResponse(request, true, false) + if !images.nil? && !images.empty? + result = {} + images.each do |image| + tagValue = "" + tags = image["RepoTags"] + if !tags.nil? && tags.kind_of?(Array) && tags.length > 0 + tagValue = tags[0] + end + digestValue = "" + digests = image["RepoDigests"] + if !digests.nil? && digests.kind_of?(Array) && digests.length > 0 + digestValue = digests[0] + end + idValue = image["Id"] + if !idValue.nil? + result[idValue] = getImageRepositoryImageTag(tagValue, digestValue) end - return result + end end + rescue => errorStr + $log.warn("Exception at getImageIdMap: #{errorStr} @ #{Time.now.utc.iso8601}") + end + return result + end - def dockerInspectContainer(id) - request = DockerApiRestHelper.restDockerInspect(id) - return getResponse(request, false, false) - end + def dockerInspectContainer(id) + request = DockerApiRestHelper.restDockerInspect(id) + return getResponse(request, false, false) + end - # This method returns docker version and docker api version for telemetry - def dockerInfo() - request = DockerApiRestHelper.restDockerVersion - response = getResponse(request, false, true) - dockerInfo = {} - if (response != nil) - dockerInfo['Version'] = response['Version'] - dockerInfo['ApiVersion'] = response['ApiVersion'] - end - return dockerInfo - end + # This method returns docker version and docker api version for telemetry + def dockerInfo() + request = DockerApiRestHelper.restDockerVersion + response = getResponse(request, false, true) + dockerInfo = {} + if (response != nil) + dockerInfo["Version"] = response["Version"] + 
dockerInfo["ApiVersion"] = response["ApiVersion"] + end + return dockerInfo end + end end diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 3c6b4f203..58a276cfd 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -57,7 +57,7 @@ def getKubeResourceInfo(resource) rescue => error @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") end - if (response.body.empty?) + if (!response.nil? && !response.body.nil? && response.body.empty?) @Log.warn("KubernetesAPIClient::getKubeResourceInfo : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") end return response diff --git a/source/code/plugin/in_containerinventory.rb b/source/code/plugin/in_containerinventory.rb index 05e5bc9ea..4392de280 100644 --- a/source/code/plugin/in_containerinventory.rb +++ b/source/code/plugin/in_containerinventory.rb @@ -170,12 +170,13 @@ def inspectContainer(id, nameMap, clusterCollectEnvironmentVar) end imageValue = container["Image"] if !imageValue.nil? && !imageValue.empty? - containerInstance["ImageId"] = imageValue repoImageTagArray = nameMap[imageValue] if nameMap.has_key? imageValue containerInstance["Repository"] = repoImageTagArray[0] containerInstance["Image"] = repoImageTagArray[1] containerInstance["ImageTag"] = repoImageTagArray[2] + # Setting the image id to the id in the remote repository + containerInstance["ImageId"] = repoImageTagArray[3] end end obtainContainerConfig(containerInstance, container, clusterCollectEnvironmentVar) @@ -200,7 +201,7 @@ def enumerate if !containerIds.empty? eventStream = MultiEventStream.new nameMap = DockerApiClient.getImageIdMap - clusterCollectEnvironmentVar = ENV['AZMON_CLUSTER_COLLECT_ENV_VAR'] + clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? 
&& clusterCollectEnvironmentVar.casecmp("false") == 0 $log.warn("Environment Variable collection disabled for cluster") end diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index 309dd8034..3a0e04c67 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -2,27 +2,25 @@ # frozen_string_literal: true module Fluent - class Kube_Event_Input < Input - Plugin.register_input('kubeevents', self) + Plugin.register_input("kubeevents", self) @@KubeEventsStateFile = "/var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml" def initialize super - require 'json' - - require_relative 'KubernetesApiClient' - require_relative 'oms_common' - require_relative 'omslog' - require_relative 'ApplicationInsightsUtility' + require "json" + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + require_relative "ApplicationInsightsUtility" end - config_param :run_interval, :time, :default => '1m' + config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.containerinsights.KubeEvents" - def configure (conf) + def configure(conf) super end @@ -46,63 +44,62 @@ def shutdown end def enumerate(eventList = nil) - currentTime = Time.now - emitTime = currentTime.to_f - batchTime = currentTime.utc.iso8601 - if eventList.nil? - $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") - events = JSON.parse(KubernetesApiClient.getKubeResourceInfo('events').body) - $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") - else - events = eventList + currentTime = Time.now + emitTime = currentTime.to_f + batchTime = currentTime.utc.iso8601 + if eventList.nil? 
+ $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") + events = JSON.parse(KubernetesApiClient.getKubeResourceInfo("events").body) + $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") + else + events = eventList + end + eventQueryState = getEventQueryState + newEventQueryState = [] + begin + if (!events.empty? && !events["items"].nil?) + eventStream = MultiEventStream.new + events["items"].each do |items| + record = {} + # - Not sure if ingestion has the below mapping for this custom type. Fix it as part of fixed type conversion + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + eventId = items["metadata"]["uid"] + "/" + items["count"].to_s + newEventQueryState.push(eventId) + if !eventQueryState.empty? && eventQueryState.include?(eventId) + next + end + record["ObjectKind"] = items["involvedObject"]["kind"] + record["Namespace"] = items["involvedObject"]["namespace"] + record["Name"] = items["involvedObject"]["name"] + record["Reason"] = items["reason"] + record["Message"] = items["message"] + record["Type"] = items["type"] + record["TimeGenerated"] = items["metadata"]["creationTimestamp"] + record["SourceComponent"] = items["source"]["component"] + record["FirstSeen"] = items["firstTimestamp"] + record["LastSeen"] = items["lastTimestamp"] + record["Count"] = items["count"] + if items["source"].key?("host") + record["Computer"] = items["source"]["host"] + else + record["Computer"] = (OMS::Common.get_hostname) + end + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + wrapper = { + "DataType" => "KUBE_EVENTS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper end - eventQueryState = getEventQueryState - newEventQueryState = [] - begin - if(!events.empty?) 
- eventStream = MultiEventStream.new - events['items'].each do |items| - record = {} - # - Not sure if ingestion has the below mapping for this custom type. Fix it as part of fixed type conversion - record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - eventId = items['metadata']['uid'] + "/" + items['count'].to_s - newEventQueryState.push(eventId) - if !eventQueryState.empty? && eventQueryState.include?(eventId) - next - end - record['ObjectKind']= items['involvedObject']['kind'] - record['Namespace'] = items['involvedObject']['namespace'] - record['Name'] = items['involvedObject']['name'] - record['Reason'] = items['reason'] - record['Message'] = items['message'] - record['Type'] = items['type'] - record['TimeGenerated'] = items['metadata']['creationTimestamp'] - record['SourceComponent'] = items['source']['component'] - record['FirstSeen'] = items['firstTimestamp'] - record['LastSeen'] = items['lastTimestamp'] - record['Count'] = items['count'] - if items['source'].key?('host') - record['Computer'] = items['source']['host'] - else - record['Computer'] = (OMS::Common.get_hostname) - end - record['ClusterName'] = KubernetesApiClient.getClusterName - record['ClusterId'] = KubernetesApiClient.getClusterId - wrapper = { - "DataType"=>"KUBE_EVENTS_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] - } - eventStream.add(emitTime, wrapper) if wrapper - end - router.emit_stream(@tag, eventStream) if eventStream - end - writeEventQueryState(newEventQueryState) - rescue => errorStr - $log.warn line.dump, error: errorStr.to_s - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + router.emit_stream(@tag, eventStream) if eventStream + end + writeEventQueryState(newEventQueryState) + rescue => errorStr + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end end def run_periodic @@ -135,7 
+132,7 @@ def getEventQueryState eventQueryState.push(line.chomp) #puts will append newline which needs to be removed end end - rescue => errorStr + rescue => errorStr $log.warn $log.warn line.dump, error: errorStr.to_s $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) @@ -145,20 +142,17 @@ def getEventQueryState def writeEventQueryState(eventQueryState) begin - if(!eventQueryState.nil? && !eventQueryState.empty?) + if (!eventQueryState.nil? && !eventQueryState.empty?) # No need to close file handle (f) due to block scope File.open(@@KubeEventsStateFile, "w") do |f| f.puts(eventQueryState) end end - rescue => errorStr + rescue => errorStr $log.warn $log.warn line.dump, error: errorStr.to_s $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end - end # Kube_Event_Input - end # module - diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index aabda441e..0310fa419 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -58,81 +58,83 @@ def enumerate if (!nodeInventory.empty?) eventStream = MultiEventStream.new containerNodeInventoryEventStream = MultiEventStream.new - #get node inventory - nodeInventory["items"].each do |items| - record = {} - # Sending records for ContainerNodeInventory - containerNodeInventoryRecord = {} - containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] + if !nodeInventory["items"].nil? 
+ #get node inventory + nodeInventory["items"].each do |items| + record = {} + # Sending records for ContainerNodeInventory + containerNodeInventoryRecord = {} + containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] - record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - record["Computer"] = items["metadata"]["name"] - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId - record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] - record["Labels"] = [items["metadata"]["labels"]] - record["Status"] = "" + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Computer"] = items["metadata"]["name"] + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] + record["Labels"] = [items["metadata"]["labels"]] + record["Status"] = "" - # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. - # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we - # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" - # implying that the node is ready for hosting pods, however its out of disk. + # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. + # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we + # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" + # implying that the node is ready for hosting pods, however its out of disk. 
- if items["status"].key?("conditions") && !items["status"]["conditions"].empty? - allNodeConditions = "" - items["status"]["conditions"].each do |condition| - if condition["status"] == "True" - if !allNodeConditions.empty? - allNodeConditions = allNodeConditions + "," + condition["type"] - else - allNodeConditions = condition["type"] + if items["status"].key?("conditions") && !items["status"]["conditions"].empty? + allNodeConditions = "" + items["status"]["conditions"].each do |condition| + if condition["status"] == "True" + if !allNodeConditions.empty? + allNodeConditions = allNodeConditions + "," + condition["type"] + else + allNodeConditions = condition["type"] + end + end + #collect last transition to/from ready (no matter ready is true/false) + if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? + record["LastTransitionTimeReady"] = condition["lastTransitionTime"] end end - #collect last transition to/from ready (no matter ready is true/false) - if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? - record["LastTransitionTimeReady"] = condition["lastTransitionTime"] + if !allNodeConditions.empty? + record["Status"] = allNodeConditions end end - if !allNodeConditions.empty? - record["Status"] = allNodeConditions - end - end - nodeInfo = items["status"]["nodeInfo"] - record["KubeletVersion"] = nodeInfo["kubeletVersion"] - record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] - containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] - dockerVersion = nodeInfo["containerRuntimeVersion"] - dockerVersion.slice! "docker://" - containerNodeInventoryRecord["DockerVersion"] = dockerVersion - # ContainerNodeInventory data for docker version and operating system. 
- containerNodeInventoryWrapper = { - "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], - } - containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + nodeInfo = items["status"]["nodeInfo"] + record["KubeletVersion"] = nodeInfo["kubeletVersion"] + record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] + containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] + dockerVersion = nodeInfo["containerRuntimeVersion"] + dockerVersion.slice! "docker://" + containerNodeInventoryRecord["DockerVersion"] = dockerVersion + # ContainerNodeInventory data for docker version and operating system. + containerNodeInventoryWrapper = { + "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], + } + containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper - wrapper = { - "DataType" => "KUBE_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper - # Adding telemetry to send node telemetry every 5 minutes - timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= 5) - properties = {} - properties["Computer"] = record["Computer"] - properties["KubeletVersion"] = record["KubeletVersion"] - properties["OperatingSystem"] = nodeInfo["operatingSystem"] - properties["DockerVersion"] = dockerVersion - capacityInfo = items["status"]["capacity"] - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", 
capacityInfo["memory"], properties) - telemetrySent = true + wrapper = { + "DataType" => "KUBE_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + # Adding telemetry to send node telemetry every 5 minutes + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + properties = {} + properties["Computer"] = record["Computer"] + properties["KubeletVersion"] = record["KubeletVersion"] + properties["OperatingSystem"] = nodeInfo["operatingSystem"] + properties["DockerVersion"] = dockerVersion + capacityInfo = items["status"]["capacity"] + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + telemetrySent = true + end end end router.emit_stream(@tag, eventStream) if eventStream diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 79490ba7d..9c5fef1d7 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -152,8 +152,10 @@ def getContainerEnvironmentVariables(pod, clusterCollectEnvironmentVar) containerEnvArray.each do |envVarHash| envName = envVarHash["name"] envValue = envVarHash["value"] - envArrayElement = envName + "=" + envValue - envVarsArray.push(envArrayElement) + if !envName.nil? && !envValue.nil? + envArrayElement = envName + "=" + envValue + envVarsArray.push(envArrayElement) + end end end # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE @@ -201,7 +203,9 @@ def parse_and_emit_records(podInventory, serviceList) # instead of the actual poduid. Since this uid is not being surface into the UX # its ok to use this. 
# Use kubernetes.io/config.hash to be able to correlate with cadvisor data - podUid = items["metadata"]["annotations"]["kubernetes.io/config.hash"] + if !items["metadata"]["annotations"].nil? + podUid = items["metadata"]["annotations"]["kubernetes.io/config.hash"] + end else podUid = items["metadata"]["uid"] end @@ -287,7 +291,9 @@ def parse_and_emit_records(podInventory, serviceList) record["ContainerID"] = "" end #keeping this as which is same as InstanceName in perf table - record["ContainerName"] = podUid + "/" + container["name"] + if !podUid.nil? && !container["name"].nil? + record["ContainerName"] = podUid + "/" + container["name"] + end #Pod restart count is a sumtotal of restart counts of individual containers #within the pod. The restart count of a container is maintained by kubernetes #itself in the form of a container label. diff --git a/source/code/plugin/in_kube_services.rb b/source/code/plugin/in_kube_services.rb index e1bb93f30..8b0a013e4 100644 --- a/source/code/plugin/in_kube_services.rb +++ b/source/code/plugin/in_kube_services.rb @@ -2,108 +2,101 @@ # frozen_string_literal: true module Fluent - - class Kube_Services_Input < Input - Plugin.register_input('kubeservices', self) - - def initialize - super - require 'yaml' - require 'json' - - require_relative 'KubernetesApiClient' - require_relative 'oms_common' - require_relative 'omslog' - require_relative 'ApplicationInsightsUtility' + class Kube_Services_Input < Input + Plugin.register_input("kubeservices", self) - end - - config_param :run_interval, :time, :default => '1m' - config_param :tag, :string, :default => "oms.containerinsights.KubeServices" - - def configure (conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) - end - end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal + def initialize + super + require "yaml" + 
require "json" + + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + require_relative "ApplicationInsightsUtility" + end + + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.containerinsights.KubeServices" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def enumerate + currentTime = Time.now + emitTime = currentTime.to_f + batchTime = currentTime.utc.iso8601 + $log.info("in_kube_services::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") + serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body) + $log.info("in_kube_services::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") + begin + if (!serviceList.empty?) 
+ eventStream = MultiEventStream.new + serviceList["items"].each do |items| + record = {} + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["ServiceName"] = items["metadata"]["name"] + record["Namespace"] = items["metadata"]["namespace"] + record["SelectorLabels"] = [items["spec"]["selector"]] + record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterIP"] = items["spec"]["clusterIP"] + record["ServiceType"] = items["spec"]["type"] + # : Add ports and status fields + wrapper = { + "DataType" => "KUBE_SERVICES_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], } - @thread.join + eventStream.add(emitTime, wrapper) if wrapper end + router.emit_stream(@tag, eventStream) if eventStream end - - def enumerate - currentTime = Time.now - emitTime = currentTime.to_f - batchTime = currentTime.utc.iso8601 - $log.info("in_kube_services::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo('services').body) - $log.info("in_kube_services::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") - begin - if(!serviceList.empty?) 
- eventStream = MultiEventStream.new - serviceList['items'].each do |items| - record = {} - record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - record['ServiceName'] = items['metadata']['name'] - record['Namespace'] = items['metadata']['namespace'] - record['SelectorLabels'] = [items['spec']['selector']] - record['ClusterId'] = KubernetesApiClient.getClusterId - record['ClusterName'] = KubernetesApiClient.getClusterName - record['ClusterIP'] = items['spec']['clusterIP'] - record['ServiceType'] = items['spec']['type'] - # : Add ports and status fields - wrapper = { - "DataType"=>"KUBE_SERVICES_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] - } - eventStream.add(emitTime, wrapper) if wrapper - end - router.emit_stream(@tag, eventStream) if eventStream - end - rescue => errorStr - $log.warn line.dump, error: errorStr.to_s - $log.debug_backtrace(e.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - end - - def run_periodic - @mutex.lock - done = @finished - until done - @condition.wait(@mutex, @run_interval) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kube_services::run_periodic @ #{Time.now.utc.iso8601}") - enumerate - rescue => errorStr - $log.warn "in_kube_services::run_periodic: enumerate Failed to kube services: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - end - @mutex.lock + rescue => errorStr + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_kube_services::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_kube_services::run_periodic: enumerate Failed to kube services: #{errorStr}" + 
ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - @mutex.unlock end - - end # Kube_Services_Input - - end # module - - \ No newline at end of file + @mutex.lock + end + @mutex.unlock + end + end # Kube_Services_Input +end # module From 4f87b582f19083587ce34cea7af50108f6e2d105 Mon Sep 17 00:00:00 2001 From: rashmy Date: Tue, 9 Jul 2019 13:30:56 -0700 Subject: [PATCH 10/12] changes --- installer/conf/telegraf-rs.conf | 26 ++++++++++---------------- installer/conf/telegraf.conf | 11 ----------- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index 53aa03620..ce60bfa04 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -538,15 +538,20 @@ #Prometheus Custom Metrics [[inputs.prometheus]] - #name_prefix="container.azm.ms/" - ## An array of urls to scrape metrics from. - #urls = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "http://$NODE_IP:10254/metrics", "http://$NODE_IP:9100/metrics"] - #fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] interval = "$AZMON_RS_PROM_INTERVAL" + ## An array of urls to scrape metrics from. urls = $AZMON_RS_PROM_URLS - + + ## An array of Kubernetes services to scrape metrics from. kubernetes_services = $AZMON_RS_PROM_K8S_SERVICES + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. 
+ ## - prometheus.io/port: If port is not 9102 use this annotation monitor_kubernetes_pods = $AZMON_RS_PROM_MONITOR_PODS fieldpass = $AZMON_RS_PROM_FIELDPASS @@ -555,20 +560,9 @@ metric_version = 2 url_tag = "scrapeUrl" - ## An array of Kubernetes services to scrape metrics from. - #kubernetes_services = ["https://kube-state-metrics.monitoring:8443/metrics","https://kube-state-metrics.monitoring:9443/metrics","http://oce-scc-template-nginx-ingress-controller.oce-nginx:10254/metrics"] - ## Kubernetes config file to create client from. # kube_config = "/path/to/kubernetes.config" - ## Scrape Kubernetes pods for the following prometheus annotations: - ## - prometheus.io/scrape: Enable scraping for this pod - ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to - ## set this to `https` & most likely set the tls config. - ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. - ## - prometheus.io/port: If port is not 9102 use this annotation - # monitor_kubernetes_pods = true - ## Use bearer token for authorization. ('bearer_token' takes priority) bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" ## OR diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 47e71c5f5..4883de81b 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -583,20 +583,9 @@ metric_version = 2 url_tag = "scrapeUrl" - ## An array of Kubernetes services to scrape metrics from. - #kubernetes_services = ["http://$NODE_IP:10255/metrics", "http://$NODE_IP:10255/metrics/cadvisor", "https://$NODE_IP:9100/metrics"] - ## Kubernetes config file to create client from. 
# kube_config = "/path/to/kubernetes.config" - ## Scrape Kubernetes pods for the following prometheus annotations: - ## - prometheus.io/scrape: Enable scraping for this pod - ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to - ## set this to `https` & most likely set the tls config. - ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. - ## - prometheus.io/port: If port is not 9102 use this annotation - # monitor_kubernetes_pods = true - ## Use bearer token for authorization. ('bearer_token' takes priority) bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" ## OR From 574ad1cdc91a98610485bc6976a4bd470a8a91a9 Mon Sep 17 00:00:00 2001 From: rashmy Date: Tue, 9 Jul 2019 15:09:44 -0700 Subject: [PATCH 11/12] changes for poduid nil check --- source/code/plugin/in_kube_podinventory.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 9c5fef1d7..d0056fb14 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -203,7 +203,9 @@ def parse_and_emit_records(podInventory, serviceList) # instead of the actual poduid. Since this uid is not being surface into the UX # its ok to use this. # Use kubernetes.io/config.hash to be able to correlate with cadvisor data - if !items["metadata"]["annotations"].nil? + if items["metadata"]["annotations"].nil? + next + else podUid = items["metadata"]["annotations"]["kubernetes.io/config.hash"] end else @@ -291,7 +293,9 @@ def parse_and_emit_records(podInventory, serviceList) record["ContainerID"] = "" end #keeping this as which is same as InstanceName in perf table - if !podUid.nil? && !container["name"].nil? + if podUid.nil? || container["name"].nil? 
+ next + else record["ContainerName"] = podUid + "/" + container["name"] end #Pod restart count is a sumtotal of restart counts of individual containers From 5c8c3922935ae1c7a98f21576b4a0277a08a80c7 Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 10 Jul 2019 13:34:43 -0700 Subject: [PATCH 12/12] removing buffer chunk size and buffer max size from fluentbit conf --- installer/conf/td-agent-bit.conf | 2 -- 1 file changed, 2 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index e7aabd242..ab79710c7 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -12,8 +12,6 @@ DB.Sync Off Parser docker Mem_Buf_Limit 10m - Buffer_Chunk_Size 1m - Buffer_Max_Size 1m Rotate_Wait 20 Refresh_Interval 30 Path_Key filepath