diff --git a/README.md b/README.md index 3564345ee..555234c61 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ The general directory structure is: │ │ ├── acrworkflows/ - acr work flows for the Linux Agent container image │ │ ├── defaultpromenvvariables - default environment variables for Prometheus scraping │ │ ├── defaultpromenvvariables-rs - cluster level default environment variables for Prometheus scraping +│ │ ├── defaultpromenvvariables-sidecar - cluster level default environment variables for Prometheus scraping in sidecar │ ├── windows/ - scripts to build the Docker image for Windows Agent │ │ ├── dockerbuild - script to build the code and docker imag, and publish docker image │ │ ├── acrworkflows/ - acr work flows for the Windows Agent container image diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 80d6f188d..01fb861cf 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -102,6 +102,114 @@ Note : The agent version(s) below has dates (ciprod), which indicate t - Telemetry fix for agent telemetry for sov. clouds +### 03/26/2021 - +##### Version microsoft/oms:ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021 (linux) +##### Version microsoft/oms:win-ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021 (windows) +##### Code change log +- Started collecting new metric - kubelet running pods count +- Onboarding script fixes to add explicit json output +- Proxy and token updates for ARC +- Doc updates for Microsoft charts repo release +- Bug fixes for trailing whitespaces in enable-monitoring.sh script +- Support for higher volume of prometheus metrics by scraping metrics from sidecar +- Update to get new version of telegraf - 1.18 +- Add label and field selectors for prometheus scraping using annotations +- Support for OSM integration +- Removed wireserver calls to get CA certs since access is removed +- Added liveness timeout for exec for linux containers + +### 02/23/2021 - +##### Version microsoft/oms:ciprod02232021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021 (linux) +##### Version microsoft/oms:win-ciprod02232021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod02232021 (windows) +##### Code change log +- ContainerLogV2 schema support for LogAnalytics & ADX (not usable externally yet) +- Fix nodemetrics (cpuusageprecentage & memoryusagepercentage) metrics not flowing. This is fixed upstream for k8s versions >= 1.19.7 and >=1.20.2. +- Fix cpu & memory usage exceeded threshold container metrics not flowing when requests and/or limits were not set +- Mute some unused exceptions from going to telemetry +- Collect containerimage (repository, image & imagetag) from spec (instead of runtime) +- Add support for extension MSI for k8s arc +- Use cloud specific instrumentation keys for telemetry +- Picked up newer version for apt +- Add priority class to daemonset (in our chart only) + +### 01/11/2021 - +##### Version microsoft/oms:ciprod01112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01112021 (linux) +##### Version microsoft/oms:win-ciprod01112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01112021 (windows) +##### Code change log +- Fixes for Linux Agent Replicaset Pod OOMing issue +- Update fluentbit (1.14.2 to 1.6.8) for the Linux Daemonset +- Make Fluentbit settings: log_flush_interval_secs, tail_buf_chunksize_megabytes and tail_buf_maxsize_megabytes configurable via configmap +- Support for PV inventory collection +- Removal of Custom metric region check for Public cloud regions and update to use cloud environment variable to determine the custom metric support +- For daemonset pods, add the dnsconfig to use ndots: 3 from ndots:5 to optimize the number of DNS API calls made +- Fix for inconsistency in the collection container environment variables for the pods which has high number of containers +- Fix for disabling of std{out;err} log_collection_settings via configmap issue in windows daemonset +- Update to use workspace key from mount file rather than environment variable for windows daemonset agent +- Remove per container info logs in the container inventory +- Enable ADX route for windows container logs +- Remove logging to termination log in windows agent liveness probe + +### 11/09/2020 - +##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020 (linux) +##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod11092020 (windows) +##### Code change log +- Fix for duplicate windows metrics + +### 10/27/2020 - +##### Version microsoft/oms:ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10272020 (linux) +##### Version microsoft/oms:win-ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10272020 (windows) +##### Code change log +- Activate oneagent in few AKS regions (koreacentral,norwayeast) +- Disable syslog +- Fix timeout for Windows daemonset liveness probe +- Make request == limit for Windows daemonset resources (cpu & memory) +- Schema v2 for container log (ADX only - applicable only for select customers for piloting) + +### 10/05/2020 - +##### Version microsoft/oms:ciprod10052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10052020 (linux) +##### Version microsoft/oms:win-ciprod10052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10052020 (windows) +##### Code change log +- Health CRD to version v1 (from v1beta1) for k8s versions >= 1.19.0 +- Collection of PV usage metrics for PVs mounted by pods (kube-system pods excluded by default)(doc-link-needed) +- Zero fill few custom metrics under a timer, also add zero filling for new PV usage metrics +- Collection of additional Kubelet metrics ('kubelet_running_pod_count','volume_manager_total_volumes','kubelet_node_config_error','process_resident_memory_bytes','process_cpu_seconds_total','kubelet_runtime_operations_total','kubelet_runtime_operations_errors_total'). This also includes updates to 'kubelet' workbook to include these new metrics +- Collection of Azure NPM (Network Policy Manager) metrics (basic & advanced. By default, NPM metrics collection is turned OFF)(doc-link-needed) +- Support log collection when docker root is changed with knode. Tracked by [this](https://github.com/Azure/AKS/issues/1373) issue +- Support for Pods in 'Terminating' state for nodelost scenarios +- Fix for reduction in telemetry for custom metrics ingestion failures +- Fix CPU capacity/limits metrics being 0 for Virtual nodes (VK) +- Add new custom metric regions (eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth) +- Enable strict SSL validation for AppInsights Ruby SDK +- Turn off custom metrics upload for unsupported cluster types +- Install CA certs from wire server for windows (in certain clouds) + +### 09/16/2020 - +> Note: This agent release targetted ONLY for non-AKS clusters via Azure Monitor for containers HELM chart update +##### Version microsoft/oms:ciprod09162020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod09162020 (linux) +##### Version microsoft/oms:win-ciprod09162020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod09162020 (windows) +##### Code change log +- Collection of Azure Network Policy Manager Basic and Advanced metrics +- Add support in Windows Agent for Container log collection of CRI runtimes such as ContainerD +- Alertable metrics support Arc K8s cluster to parity with AKS +- Support for multiple container log mount paths when docker is updated through knode +- Bug fix related to MDM telemetry + +### 08/07/2020 - +##### Version microsoft/oms:ciprod08072020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08072020 (linux) +##### Version microsoft/oms:win-ciprod08072020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod08072020 (windows) +##### Code change log +- Collection of KubeState metrics for deployments and HPA +- Add the Proxy support for Windows agent +- Fix for ContainerState in ContainerInventory to handle Failed state and collection of environment variables for terminated and failed containers +- Change /spec to /metrics/cadvisor endpoint to collect node capacity metrics +- Disable Health Plugin by default and can enabled via configmap +- Pin version of jq to 1.5+dfsg-2 +- Bug fix for showing node as 'not ready' when there is disk pressure +- oneagent integration (disabled by default) +- Add region check before sending alertable metrics to MDM +- Telemetry fix for agent telemetry for sov. clouds + + ### 07/15/2020 - ##### Version microsoft/oms:ciprod07152020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod07152020 (linux) ##### Version microsoft/oms:win-ciprod05262020-2 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod05262020-2 (windows) diff --git a/ReleaseProcess.md b/ReleaseProcess.md index 2a3e6001a..c6f51bb65 100644 --- a/ReleaseProcess.md +++ b/ReleaseProcess.md @@ -43,13 +43,47 @@ This needs to be co-ordinated with Red hat and ARO-RP team for the release and Make PR against [AKS-Engine](https://github.com/Azure/aks-engine). Refer PR https://github.com/Azure/aks-engine/pull/2318 -## ARO v4, On-prem K8s, Azure Arc K8s and OpenShift v4 clusters +## ARO v4, Azure Arc K8s and OpenShift v4 clusters Make sure azuremonitor-containers chart yamls updates with all changes going with the release and also make sure to bump the chart version, imagetag and docker provider version etc. Similar to agent container image, build pipeline automatically push the chart to container insights prod acr for canary and prod repos accordingly. Both the agent and helm chart will be replicated to `mcr.microsoft.com`. The way, customers will be onboard the monitoring to these clusters using onboarding scripts under `onboarding\managed` directory so please bump chart version for prod release. Once we move to Arc K8s Monitoring extension Public preview, these will be taken care so at that point of time no manual changes like this required. +## Microsoft Charts Repo release for On-prem K8s + +Since HELM charts repo being deprecated, Microsoft charts repo being used for HELM chart release of on-prem K8s clusters. +To make chart release PR, fork [Microsoft-charts-repo]([https://github.com/microsoft/charts/tree/gh-pages) and make the PR against `gh-pages` branch of the upstream repo. + +Refer PR - https://github.com/microsoft/charts/pull/23 for example. +Once the PR merged, latest version of HELM chart should be available in couple of mins in https://microsoft.github.io/charts/repo and https://artifacthub.io/. + +Instructions to create PR +``` +# 1. create helm package for the release candidate + git clone git@github.com:microsoft/Docker-Provider.git + git checkout ci_prod + cd ~/Docker-Provider/charts/azuremonitor-containers # this path based on where you have cloned the repo + helm package . + +# 2. clone your fork repo and checkout gh_pages branch # gh_pages branch used as release branch + cd ~ + git clone + cd ~/charts # assumed the root dir of the clone is charts + git checkout gh_pages + +# 3. copy release candidate helm package + cd ~/charts/repo/azuremonitor-containers + # update chart version value with the version of chart being released + cp ~/Docker-Provider/charts/azuremonitor-containers/azuremonitor-containers-.tgz . + cd ~/charts/repo + # update repo index file + helm repo index . + +# 4. Review the changes and make PR. Please note, you may need to revert unrelated changes automatically added by `helm repo index .` command + +``` + # 4. Monitor agent roll-out status In Container Insights Agent (AKS) telemetry dashboard, update the agent roll status by region chart with released agent image and track rollout status. If you see any issues with agent rollout, reach out AKS on-call team for the help on investigation and understanding whats going on. diff --git a/build/common/installer/scripts/tomlparser-prom-customconfig.rb b/build/common/installer/scripts/tomlparser-prom-customconfig.rb new file mode 100644 index 000000000..819c1956f --- /dev/null +++ b/build/common/installer/scripts/tomlparser-prom-customconfig.rb @@ -0,0 +1,423 @@ +#!/usr/local/bin/ruby + +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end +# require_relative "tomlrb" +require_relative "ConfigParseErrorLogger" +require "fileutils" + +@promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" +@replicaset = "replicaset" +@daemonset = "daemonset" +@promSideCar = "prometheussidecar" +@windows = "windows" +@configSchemaVersion = "" +@defaultDsInterval = "1m" +@defaultDsPromUrls = [] +@defaultDsFieldPass = [] +@defaultDsFieldDrop = [] +@defaultRsInterval = "1m" +@defaultRsPromUrls = [] +@defaultRsFieldPass = [] +@defaultRsFieldDrop = [] +@defaultRsK8sServices = [] +# @defaultRsMonitorPods = false +@defaultCustomPrometheusInterval = "1m" +@defaultCustomPrometheusFieldPass = [] +@defaultCustomPrometheusFieldDrop = [] +@defaultCustomPrometheusMonitorPods = false +@defaultCustomPrometheusLabelSelectors = "" +@defaultCustomPrometheusFieldSelectors = "" + +#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering +@metricVersion = 2 +@monitorKubernetesPodsVersion = 2 +@urlTag = "scrapeUrl" +@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" +@responseTimeout = "15s" +@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" +@insecureSkipVerify = true + +# Checking to see if this is the daemonset or replicaset to parse config accordingly +@controller = ENV["CONTROLLER_TYPE"] +@containerType = ENV["CONTAINER_TYPE"] +@sidecarScrapingEnabled = ENV["SIDECAR_SCRAPING_ENABLED"] + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@promConfigMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values for prometheus config map" + parsedConfig = Tomlrb.load_file(@promConfigMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted prometheus config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for prometheus scraping" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for prometheus config: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +def checkForTypeArray(arrayValue, arrayType) + if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) + return true + else + return false + end +end + +def checkForType(variable, varType) + if variable.nil? || variable.kind_of?(varType) + return true + else + return false + end +end + +def replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + begin + puts "config::Starting to substitute the placeholders in telegraf conf copy file with no namespace filters" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", ("monitor_kubernetes_pods = #{monitorKubernetesPods}")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", ("pod_scrape_scope = \"#{(@controller.casecmp(@replicaset) == 0) ? "cluster" : "node"}\"")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", "") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", ("kubernetes_label_selector = \"#{kubernetesLabelSelectors}\"")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", ("kubernetes_field_selector = \"#{kubernetesFieldSelectors}\"")) + rescue => errorStr + puts "Exception while replacing default pod monitor settings for custom prometheus scraping: #{errorStr}" + end + return new_contents +end + +def createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + begin + puts "config::Starting to substitute the placeholders in telegraf conf copy file with namespace filters" + + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE") + + pluginConfigsWithNamespaces = "" + monitorKubernetesPodsNamespaces.each do |namespace| + if !namespace.nil? + #Stripping namespaces to remove leading and trailing whitespaces + namespace.strip! + if namespace.length > 0 + pluginConfigsWithNamespaces += "\n[[inputs.prometheus]] + interval = \"#{interval}\" + monitor_kubernetes_pods = true + pod_scrape_scope = \"#{(@controller.casecmp(@replicaset) == 0) ? "cluster" : "node"}\" + monitor_kubernetes_pods_namespace = \"#{namespace}\" + kubernetes_label_selector = \"#{kubernetesLabelSelectors}\" + kubernetes_field_selector = \"#{kubernetesFieldSelectors}\" + fieldpass = #{fieldPassSetting} + fielddrop = #{fieldDropSetting} + metric_version = #{@metricVersion} + url_tag = \"#{@urlTag}\" + bearer_token = \"#{@bearerToken}\" + response_timeout = \"#{@responseTimeout}\" + tls_ca = \"#{@tlsCa}\" + insecure_skip_verify = #{@insecureSkipVerify}\n" + end + end + end + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", pluginConfigsWithNamespaces) + return new_contents + rescue => errorStr + puts "Exception while creating prometheus input plugins to filter namespaces for custom prometheus: #{errorStr}, using defaults" + replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + if !@controller.nil? + if !parsedConfig.nil? && !parsedConfig[:prometheus_data_collection_settings].nil? + if @controller.casecmp(@replicaset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? + #Get prometheus replicaset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:cluster][:urls] + kubernetesServices = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_services] + + # Remove below 4 lines after phased rollout + monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] + monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] + kubernetesLabelSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_label_selector] + kubernetesFieldSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_field_selector] + + # Check for the right datatypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(kubernetesServices, String) && + checkForTypeArray(urls, String) && + # Remove below check after phased rollout + checkForType(kubernetesLabelSelectors, String) && + checkForType(kubernetesFieldSelectors, String) && + (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) # Checking for Boolean type, since 'Boolean' is not defined as a type in ruby + puts "config::Successfully passed typecheck for config settings for replicaset" + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultRsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultRsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultRsFieldDrop : fieldDrop + kubernetesServices = (kubernetesServices.nil?) ? @defaultRsK8sServices : kubernetesServices + urls = (urls.nil?) ? @defaultRsPromUrls : urls + # Remove below lines after phased rollout + monitorKubernetesPods = (monitorKubernetesPods.nil?) ? @defaultRsMonitorPods : monitorKubernetesPods + kubernetesLabelSelectors = (kubernetesLabelSelectors.nil?) ? @defaultCustomPrometheusLabelSelectors : kubernetesLabelSelectors + kubernetesFieldSelectors = (kubernetesFieldSelectors.nil?) ? @defaultCustomPrometheusFieldSelectors : kubernetesFieldSelectors + + file_name = "/opt/telegraf-test-rs.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for replicaset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", interval) + fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", fieldPassSetting) + fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", fieldDropSetting) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES", ((kubernetesServices.length > 0) ? ("[\"" + kubernetesServices.join("\",\"") + "\"]") : "[]")) + + # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - + # - to use defaults in case of nil settings + # Remove below block after phased rollout + if (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil? && (@sidecarScrapingEnabled.casecmp("false") == 0))) + monitorKubernetesPodsNSConfig = [] + if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? && checkForTypeArray(monitorKubernetesPodsNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (monitorKubernetesPodsNamespaces.length > 0) + new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length + monitorKubernetesPodsNSConfig = monitorKubernetesPodsNamespaces + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + # Label and field selectors are passed as strings. For field selectors, split by commas to get the number of key-value pairs. + # Label selectors can be formatted as "app in (app1, app2, app3)", so split by commas only outside parentheses to get the number of key-value pairs. + kubernetesLabelSelectorsLength = kubernetesLabelSelectors.split(/,\s*(?=[^()]*(?:\(|$))/).length + kubernetesFieldSelectorsLength = kubernetesFieldSelectors.split(",").length + end + + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? + file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_RS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export TELEMETRY_RS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") + file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") + # Remove below block after phased rollout + if (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil? && (@sidecarScrapingEnabled.casecmp("false") == 0))) + file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + file.write("export TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") + file.write("export TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH=\"#{kubernetesLabelSelectorsLength}\"\n") + file.write("export TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH=\"#{kubernetesFieldSelectorsLength}\"\n") + end + + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for replicaset" + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for replicaset, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults") + setRsPromDefaults + puts "****************End Prometheus Config Processing********************" + end + elsif @controller.casecmp(@daemonset) == 0 && + ((!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) || + (!@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0) && @sidecarScrapingEnabled.strip.casecmp("true") == 0) && + !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? + #Get prometheus custom config settings for monitor kubernetes pods + begin + interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] + monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] + monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] + kubernetesLabelSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_label_selector] + kubernetesFieldSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_field_selector] + + # Check for the right datattypes to enforce right setting values + if checkForType(interval, String) && + checkForType(kubernetesLabelSelectors, String) && + checkForType(kubernetesFieldSelectors, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby + puts "config::Successfully passed typecheck for config settings for custom prometheus scraping" + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultCustomPrometheusInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultCustomPrometheusFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultCustomPrometheusFieldDrop : fieldDrop + monitorKubernetesPods = (monitorKubernetesPods.nil?) ? @defaultCustomPrometheusMonitorPods : monitorKubernetesPods + kubernetesLabelSelectors = (kubernetesLabelSelectors.nil?) ? @defaultCustomPrometheusLabelSelectors : kubernetesLabelSelectors + kubernetesFieldSelectors = (kubernetesFieldSelectors.nil?) ? @defaultCustomPrometheusFieldSelectors : kubernetesFieldSelectors + + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + file_name = "/etc/telegraf/telegraf.conf" + else + file_name = "/opt/telegraf-test-prom-side-car.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf", file_name) + end + puts "config::Starting to substitute the placeholders in telegraf conf copy file for linux or conf file for windows for custom prometheus scraping" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", interval) + fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", fieldPassSetting) + fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", fieldDropSetting) + + # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - + # - to use defaults in case of nil settings + monitorKubernetesPodsNSConfig = [] + if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? && checkForTypeArray(monitorKubernetesPodsNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (monitorKubernetesPodsNamespaces.length > 0) + new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length + monitorKubernetesPodsNSConfig = monitorKubernetesPodsNamespaces + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + + # Label and field selectors are passed as strings. For field selectors, split by commas to get the number of key-value pairs. + # Label selectors can be formatted as "app in (app1, app2, app3)", so split by commas only outside parentheses to get the number of key-value pairs. + kubernetesLabelSelectorsLength = kubernetesLabelSelectors.split(/,\s*(?=[^()]*(?:\(|$))/).length + kubernetesFieldSelectorsLength = kubernetesFieldSelectors.split(",").length + + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for custom prometheus scraping" + #Set environment variables for telemetry in the sidecar container + if (!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_LABEL_SELECTOR_LENGTH=\"#{kubernetesLabelSelectorsLength}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_FIELD_SELECTOR_LENGTH=\"#{kubernetesFieldSelectorsLength}\"\n") + + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for prometheus sidecar" + end + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for prometheus side car, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for promethues side car: #{errorStr}, using defaults") + puts "****************End Prometheus Config Processing********************" + end + elsif @controller.casecmp(@daemonset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:node].nil? + #Get prometheus daemonset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:node][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:node][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:node][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:node][:urls] + + # Check for the right datattypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(urls, String) + puts "config::Successfully passed typecheck for config settings for daemonset" + + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultDsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultDsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultDsFieldDrop : fieldDrop + urls = (urls.nil?) ? @defaultDsPromUrls : urls + + file_name = "/opt/telegraf-test.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for daemonset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" + + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? + file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_DS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export TELEMETRY_DS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export TELEMETRY_DS_PROM_URLS_LENGTH=#{urls.length}\n") + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for daemonset" + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for daemonset, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults, please check correctness of configmap") + puts "****************End Prometheus Config Processing********************" + end + end # end of controller type check + end + else + ConfigParseErrorLogger.logError("Controller undefined while processing prometheus config, using defaults") + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Prometheus Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@promConfigMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported version") + else + puts "config::No configmap mounted for prometheus custom config, using defaults" + end +end +puts "****************End Prometheus Config Processing********************" diff --git a/build/linux/installer/conf/prometheus-side-car.conf b/build/linux/installer/conf/prometheus-side-car.conf new file mode 100644 index 000000000..fd40910d9 --- /dev/null +++ b/build/linux/installer/conf/prometheus-side-car.conf @@ -0,0 +1,4 @@ + + + + diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf new file mode 100644 index 000000000..720f54820 --- /dev/null +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -0,0 +1,28 @@ +[SERVICE] + #Default service flush interval is 15 seconds + Flush 15 + HTTP_Server Off + Daemon Off + storage.path /var/opt/microsoft/docker-cimprov/state/flbstore/ + storage.sync normal + storage.checksum off + storage.backlog.mem_limit 10M + Log_Level info + Parsers_File /etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf + Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log + +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25229 + Chunk_Size 1m + Buffer_Size 1m + Mem_Buf_Limit 20m + +[OUTPUT] + Name oms + EnableTelemetry true + Retry_Limit 10 + TelemetryPushIntervalSeconds 300 + Match oms.container.* \ No newline at end of file diff --git a/build/linux/installer/conf/telegraf-prom-side-car.conf b/build/linux/installer/conf/telegraf-prom-side-car.conf new file mode 100644 index 000000000..b3b4ba1d3 --- /dev/null +++ b/build/linux/installer/conf/telegraf-prom-side-car.conf @@ -0,0 +1,162 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + hostName = "placeholder_hostname" + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 3000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 60000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "15s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types. +[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25229" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + namedrop = ["agent_telemetry"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +[[processors.converter]] + [processors.converter.fields] + float = ["*"] + +#Prometheus Custom Metrics +[[inputs.prometheus]] + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP + + metric_version = 2 + url_tag = "scrapeUrl" + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER + +## OSM Prometheus configuration +$AZMON_TELEGRAF_OSM_PROM_PLUGINS diff --git a/build/linux/installer/conf/telegraf-rs.conf b/build/linux/installer/conf/telegraf-rs.conf index d81196330..ee1cf8819 100644 --- a/build/linux/installer/conf/telegraf-rs.conf +++ b/build/linux/installer/conf/telegraf-rs.conf @@ -540,13 +540,13 @@ #Prometheus Custom Metrics [[inputs.prometheus]] - interval = "$AZMON_RS_PROM_INTERVAL" + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" ## An array of urls to scrape metrics from. - urls = $AZMON_RS_PROM_URLS + urls = $AZMON_TELEGRAF_CUSTOM_PROM_URLS ## An array of Kubernetes services to scrape metrics from. - kubernetes_services = $AZMON_RS_PROM_K8S_SERVICES + kubernetes_services = $AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES ## Scrape Kubernetes pods for the following prometheus annotations: ## - prometheus.io/scrape: Enable scraping for this pod @@ -554,10 +554,15 @@ ## set this to `https` & most likely set the tls config. ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. ## - prometheus.io/port: If port is not 9102 use this annotation - $AZMON_RS_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS - fieldpass = $AZMON_RS_PROM_FIELDPASS - fielddrop = $AZMON_RS_PROM_FIELDDROP + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP metric_version = 2 url_tag = "scrapeUrl" @@ -581,7 +586,11 @@ insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] -$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER + +## OSM Prometheus configuration +$AZMON_TELEGRAF_OSM_PROM_PLUGINS + # [[inputs.exec]] # ## Commands array # interval = "15m" diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index 202ac9741..5a5bb2d8c 100644 --- a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -675,7 +675,9 @@ ## An array of urls to scrape metrics from. urls = ["$CADVISOR_METRICS_URL"] - fieldpass = ["kubelet_running_pod_count","volume_manager_total_volumes", "kubelet_node_config_error", "process_resident_memory_bytes", "process_cpu_seconds_total"] + # <= 1.18: metric name is kubelet_running_pod_count + # >= 1.19: metric name changed to kubelet_running_pods + fieldpass = ["kubelet_running_pod_count","kubelet_running_pods","volume_manager_total_volumes", "kubelet_node_config_error", "process_resident_memory_bytes", "process_cpu_seconds_total"] metric_version = 2 url_tag = "scrapeUrl" diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index c680f0eea..df8fbc3da 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -110,24 +110,28 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlrb/string_utils.rb; source/toml-parser/tomlrb/string_utils.rb; 644; root; root /opt/tomlrb/version.rb; source/toml-parser/tomlrb/version.rb; 644; root; root -/opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root -/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; build/linux/installer/conf/td-agent-bit.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; build/linux/installer/conf/td-agent-bit-rs.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/linux/installer/conf/azm-containers-parser.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/out_oms.conf; build/linux/installer/conf/out_oms.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/telegraf.conf; build/linux/installer/conf/telegraf.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; build/linux/installer/conf/telegraf-rs.conf; 644; root; root -/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root -/opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root -/opt/tomlparser-prom-customconfig.rb; build/linux/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root -/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root -/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root +/opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root +/etc/opt/microsoft/docker-cimprov/prometheus-side-car.conf; build/linux/installer/conf/prometheus-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; build/linux/installer/conf/td-agent-bit.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf; build/linux/installer/conf/td-agent-bit-prom-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; build/linux/installer/conf/td-agent-bit-rs.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/linux/installer/conf/azm-containers-parser.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/out_oms.conf; build/linux/installer/conf/out_oms.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf.conf; build/linux/installer/conf/telegraf.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf; build/linux/installer/conf/telegraf-prom-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; build/linux/installer/conf/telegraf-rs.conf; 644; root; root +/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root +/opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root +/opt/tomlparser-prom-customconfig.rb; build/common/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root +/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root /opt/tomlparser-agent-config.rb; build/linux/installer/scripts/tomlparser-agent-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; build/common/installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root /opt/ConfigParseErrorLogger.rb; build/common/installer/scripts/ConfigParseErrorLogger.rb; 755; root; root /opt/tomlparser-npm-config.rb; build/linux/installer/scripts/tomlparser-npm-config.rb; 755; root; root +/opt/tomlparser-osm-config.rb; build/linux/installer/scripts/tomlparser-osm-config.rb; 755; root; root /opt/microsoft/omsagent/plugin/filter_cadvisor_health_container.rb; source/plugins/ruby/filter_cadvisor_health_container.rb; 644; root; root diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index e3f9fb475..a82fa28eb 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -26,15 +26,22 @@ then exit 1 fi -if [ ! -s "inotifyoutput.txt" ] +if [ -s "inotifyoutput.txt" ] then - # inotifyoutput file is empty and the grep commands for omsagent and td-agent-bit succeeded - exit 0 -else - if [ -s "inotifyoutput.txt" ] - then - # inotifyoutput file has data(config map was applied) - echo "inotifyoutput.txt has been updated - config changed" > /dev/termination-log - exit 1 - fi + # inotifyoutput file has data(config map was applied) + echo "inotifyoutput.txt has been updated - config changed" > /dev/termination-log + exit 1 fi + +# Perform the following check only for prometheus sidecar that does OSM scraping or for replicaset when sidecar scraping is disabled +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( ( ! -z "${SIDECAR_SCRAPING_ENABLED}" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ) ]]; then + if [ -s "inotifyoutput-osm.txt" ] + then + # inotifyoutput-osm file has data(config map was applied) + echo "inotifyoutput-osm.txt has been updated - config changed" > /dev/termination-log + exit 1 + fi +fi + +exit 0 diff --git a/build/linux/installer/scripts/tomlparser-osm-config.rb b/build/linux/installer/scripts/tomlparser-osm-config.rb new file mode 100644 index 000000000..096064db8 --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-osm-config.rb @@ -0,0 +1,168 @@ +#!/usr/local/bin/ruby + +require_relative "tomlrb" +require "fileutils" +require_relative "ConfigParseErrorLogger" + +@controllerType = ENV["CONTROLLER_TYPE"] +@containerType = ENV["CONTAINER_TYPE"] +@sidecarScrapingEnabled = ENV["SIDECAR_SCRAPING_ENABLED"] + +@replicaset = "replicaset" +@prometheusSidecar = "prometheussidecar" + +if !@controllerType.nil? && !@controllerType.empty? && @controllerType.strip.casecmp(@replicaset) == 0 && + (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil? && !@sidecarScrapingEnabled.empty? && @sidecarScrapingEnabled.strip.casecmp("false") == 0)) + @tgfConfigFile = "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + @tgfTestConfigFile = "/opt/telegraf-test-rs.conf" +elsif !@containerType.nil? && !@containerType.empty? && @containerType.strip.casecmp(@prometheusSidecar) == 0 + @tgfConfigFile = "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + @tgfTestConfigFile = "/opt/telegraf-test-prom-side-car.conf" +end + +@configMapMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" +@configSchemaVersion = "" +# @tgfConfigFileSidecar = "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" +# @tgfTestConfigFile = "/opt/telegraf-test-prom-side-car.conf" +@osmMetricNamespaces = [] + +#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering +@metricVersion = 2 +@monitorKubernetesPodsVersion = 2 +#@fieldPassSetting = "[\"envoy_cluster_upstream_rq_xx\", \"envoy_cluster_upstream_rq\"]" +@fieldPassSetting = "[\"envoy_cluster_upstream_cx_total\", \"envoy_cluster_upstream_cx_connect_fail\", \"envoy_cluster_upstream_rq\", \"envoy_cluster_upstream_rq_xx\", \"envoy_cluster_upstream_rq_total\", \"envoy_cluster_upstream_rq_time_bucket\", \"envoy_cluster_upstream_cx_rx_bytes_total\", \"envoy_cluster_upstream_cx_tx_bytes_total\", \"envoy_cluster_upstream_cx_active\"]" +@scrapeInterval = "1m" +@urlTag = "scrapeUrl" +@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" +@responseTimeout = "15s" +@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" +@insecureSkipVerify = true + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-osmconfig for osm metrics found, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map for osm metrics" + return parsedConfig + else + puts "config::configmap container-azm-ms-osmconfig for osm metrics not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for osm metrics: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +def checkForTypeArray(arrayValue, arrayType) + if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) + return true + else + return false + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + begin + if !parsedConfig.nil? && + !parsedConfig[:osm_metric_collection_configuration].nil? && + !parsedConfig[:osm_metric_collection_configuration][:settings].nil? + osmPromMetricNamespaces = parsedConfig[:osm_metric_collection_configuration][:settings][:monitor_namespaces] + puts "config::osm::got:osm_metric_collection_configuration.settings.monitor_namespaces='#{osmPromMetricNamespaces}'" + + # Check to see if osm_metric_collection_configuration.settings has a valid setting for monitor_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - + # - to use defaults in case of nil settings + if !osmPromMetricNamespaces.nil? && checkForTypeArray(osmPromMetricNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (osmPromMetricNamespaces.length > 0) + @osmMetricNamespaces = osmPromMetricNamespaces + end + end + end + rescue => errorStr + puts "config::osm::error:Exception while reading config settings for osm configuration settings - #{errorStr}, using defaults" + @osmMetricNamespaces = [] + end +end + +def replaceOsmTelegrafConfigPlaceHolders + begin + #replace place holders in configuration file + tgfConfig = File.read(@tgfTestConfigFile) #read returns only after closing the file + + if @osmMetricNamespaces.length > 0 + osmPluginConfigsWithNamespaces = "" + @osmMetricNamespaces.each do |namespace| + if !namespace.nil? + #Stripping namespaces to remove leading and trailing whitespaces + namespace.strip! + if namespace.length > 0 + osmPluginConfigsWithNamespaces += "\n[[inputs.prometheus]] + name_prefix=\"container.azm.ms.osm/\" + interval = \"#{@scrapeInterval}\" + monitor_kubernetes_pods = true + pod_scrape_scope = \"#{(@controllerType.casecmp(@replicaset) == 0) ? "cluster" : "node"}\" + monitor_kubernetes_pods_namespace = \"#{namespace}\" + fieldpass = #{@fieldPassSetting} + metric_version = #{@metricVersion} + url_tag = \"#{@urlTag}\" + bearer_token = \"#{@bearerToken}\" + response_timeout = \"#{@responseTimeout}\" + tls_ca = \"#{@tlsCa}\" + insecure_skip_verify = #{@insecureSkipVerify}\n" + end + end + end + tgfConfig = tgfConfig.gsub("$AZMON_TELEGRAF_OSM_PROM_PLUGINS", osmPluginConfigsWithNamespaces) + else + puts "Using defaults for OSM configuration since there was an error in OSM config map or no namespaces were set" + tgfConfig = tgfConfig.gsub("$AZMON_TELEGRAF_OSM_PROM_PLUGINS", "") + end + File.open(@tgfTestConfigFile, "w") { |file| file.puts tgfConfig } # 'file' will be closed here after it goes out of scope + puts "config::osm::Successfully substituted the OSM placeholders in #{@tgfTestConfigFile} file in sidecar container" + rescue => errorStr + # TODO: test this scenario out + puts "config::osm::error:Exception while replacing telegraf configuration settings for osm - #{errorStr}, using defaults" + end +end + +@osmConfigSchemaVersion = ENV["AZMON_OSM_CFG_SCHEMA_VERSION"] +puts "****************Start OSM Config Processing********************" +if !@osmConfigSchemaVersion.nil? && !@osmConfigSchemaVersion.empty? && @osmConfigSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + # Check to see if the prometheus custom config parser has created a test config file so that we can replace the settings in the test file and run it, If not create + # a test config file by copying contents of the actual telegraf config file. + if (!File.exist?(@tgfTestConfigFile)) + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + puts "test telegraf config file #{@tgfTestConfigFile} does not exist, creating new one" + FileUtils.cp(@tgfConfigFile, @tgfTestConfigFile) + end + + replaceOsmTelegrafConfigPlaceHolders() + + # Write the telemetry to file, so that they can be set as environment variables + telemetryFile = File.open("integration_osm_config_env_var", "w") + + if !telemetryFile.nil? + telemetryFile.write("export TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT=#{@osmMetricNamespaces.length}\n") + # Close file after writing all environment variables + telemetryFile.close + else + puts "config::osm::Exception while opening file for writing OSM telemetry environment variables" + end + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::osm::unsupported/missing config schema version - '#{@osmConfigSchemaVersion}' , using defaults, please use supported schema version") + else + puts "config::No configmap mounted for OSM config, using defaults" + end +end +puts "****************End OSM Config Processing********************" diff --git a/build/linux/installer/scripts/tomlparser-prom-customconfig.rb b/build/linux/installer/scripts/tomlparser-prom-customconfig.rb deleted file mode 100644 index 7aad580ee..000000000 --- a/build/linux/installer/scripts/tomlparser-prom-customconfig.rb +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/local/bin/ruby - -require_relative "tomlrb" -require_relative "ConfigParseErrorLogger" -require "fileutils" - -@promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" -@replicaset = "replicaset" -@daemonset = "daemonset" -@configSchemaVersion = "" -@defaultDsInterval = "1m" -@defaultDsPromUrls = [] -@defaultDsFieldPass = [] -@defaultDsFieldDrop = [] -@defaultRsInterval = "1m" -@defaultRsPromUrls = [] -@defaultRsFieldPass = [] -@defaultRsFieldDrop = [] -@defaultRsK8sServices = [] -@defaultRsMonitorPods = false - -#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering -@metricVersion = 2 -@urlTag = "scrapeUrl" -@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" -@responseTimeout = "15s" -@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" -@insecureSkipVerify = true - -# Use parser to parse the configmap toml file to a ruby structure -def parseConfigMap - begin - # Check to see if config map is created - if (File.file?(@promConfigMapMountPath)) - puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values for prometheus config map" - parsedConfig = Tomlrb.load_file(@promConfigMapMountPath, symbolize_keys: true) - puts "config::Successfully parsed mounted prometheus config map" - return parsedConfig - else - puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for prometheus scraping" - return nil - end - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config map for prometheus config: #{errorStr}, using defaults, please check config map for errors") - return nil - end -end - -def checkForTypeArray(arrayValue, arrayType) - if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) - return true - else - return false - end -end - -def checkForType(variable, varType) - if variable.nil? || variable.kind_of?(varType) - return true - else - return false - end -end - -def replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - begin - new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", ("monitor_kubernetes_pods = #{monitorKubernetesPods}")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", "") - rescue => errorStr - puts "Exception while replacing default pod monitor settings: #{errorStr}" - end - return new_contents -end - -def createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting) - begin - new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_RS_PROM_MONITOR_PODS") - pluginConfigsWithNamespaces = "" - monitorKubernetesPodsNamespaces.each do |namespace| - if !namespace.nil? - #Stripping namespaces to remove leading and trailing whitespaces - namespace.strip! - if namespace.length > 0 - pluginConfigsWithNamespaces += "\n[[inputs.prometheus]] - interval = \"#{interval}\" - monitor_kubernetes_pods = true - monitor_kubernetes_pods_namespace = \"#{namespace}\" - fieldpass = #{fieldPassSetting} - fielddrop = #{fieldDropSetting} - metric_version = #{@metricVersion} - url_tag = \"#{@urlTag}\" - bearer_token = \"#{@bearerToken}\" - response_timeout = \"#{@responseTimeout}\" - tls_ca = \"#{@tlsCa}\" - insecure_skip_verify = #{@insecureSkipVerify}\n" - end - end - end - new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", pluginConfigsWithNamespaces) - return new_contents - rescue => errorStr - puts "Exception while creating prometheus input plugins to filter namespaces: #{errorStr}, using defaults" - replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - end -end - -# Use the ruby structure created after config parsing to set the right values to be used as environment variables -def populateSettingValuesFromConfigMap(parsedConfig) - # Checking to see if this is the daemonset or replicaset to parse config accordingly - controller = ENV["CONTROLLER_TYPE"] - if !controller.nil? - if !parsedConfig.nil? && !parsedConfig[:prometheus_data_collection_settings].nil? - if controller.casecmp(@replicaset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? - #Get prometheus replicaset custom config settings - begin - interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] - fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] - fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] - urls = parsedConfig[:prometheus_data_collection_settings][:cluster][:urls] - kubernetesServices = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_services] - monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] - monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] - - # Check for the right datattypes to enforce right setting values - if checkForType(interval, String) && - checkForTypeArray(fieldPass, String) && - checkForTypeArray(fieldDrop, String) && - checkForTypeArray(kubernetesServices, String) && - checkForTypeArray(urls, String) && - (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby - puts "config::Successfully passed typecheck for config settings for replicaset" - #if setting is nil assign default values - interval = (interval.nil?) ? @defaultRsInterval : interval - fieldPass = (fieldPass.nil?) ? @defaultRsFieldPass : fieldPass - fieldDrop = (fieldDrop.nil?) ? @defaultRsFieldDrop : fieldDrop - kubernetesServices = (kubernetesServices.nil?) ? @defaultRsK8sServices : kubernetesServices - urls = (urls.nil?) ? @defaultRsPromUrls : urls - monitorKubernetesPods = (monitorKubernetesPods.nil?) ? @defaultRsMonitorPods : monitorKubernetesPods - - file_name = "/opt/telegraf-test-rs.conf" - # Copy the telegraf config file to a temp file to run telegraf in test mode with this config - FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf", file_name) - - puts "config::Starting to substitute the placeholders in telegraf conf copy file for replicaset" - #Replace the placeholder config values with values from custom config - text = File.read(file_name) - new_contents = text.gsub("$AZMON_RS_PROM_INTERVAL", interval) - fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDPASS", fieldPassSetting) - fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDDROP", fieldDropSetting) - new_contents = new_contents.gsub("$AZMON_RS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_K8S_SERVICES", ((kubernetesServices.length > 0) ? ("[\"" + kubernetesServices.join("\",\"") + "\"]") : "[]")) - - # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces - # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - - # - to use defaults in case of nil settings - if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? && checkForTypeArray(monitorKubernetesPodsNamespaces, String) - new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting) - monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length - else - new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - monitorKubernetesPodsNamespacesLength = 0 - end - - File.open(file_name, "w") { |file| file.puts new_contents } - puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" - #Set environment variables for telemetry - file = File.open("telemetry_prom_config_env_var", "w") - if !file.nil? - file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") - #Setting array lengths as environment variables for telemetry purposes - file.write("export TELEMETRY_RS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") - file.write("export TELEMETRY_RS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") - file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") - file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") - file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") - file.write("export TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") - - # Close file after writing all environment variables - file.close - puts "config::Successfully created telemetry file for replicaset" - end - else - ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for replicaset, using defaults, please use right types for all settings") - end # end of type check condition - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults") - setRsPromDefaults - puts "****************End Prometheus Config Processing********************" - end - elsif controller.casecmp(@daemonset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:node].nil? - #Get prometheus daemonset custom config settings - begin - interval = parsedConfig[:prometheus_data_collection_settings][:node][:interval] - fieldPass = parsedConfig[:prometheus_data_collection_settings][:node][:fieldpass] - fieldDrop = parsedConfig[:prometheus_data_collection_settings][:node][:fielddrop] - urls = parsedConfig[:prometheus_data_collection_settings][:node][:urls] - - # Check for the right datattypes to enforce right setting values - if checkForType(interval, String) && - checkForTypeArray(fieldPass, String) && - checkForTypeArray(fieldDrop, String) && - checkForTypeArray(urls, String) - puts "config::Successfully passed typecheck for config settings for daemonset" - - #if setting is nil assign default values - interval = (interval.nil?) ? @defaultDsInterval : interval - fieldPass = (fieldPass.nil?) ? @defaultDsFieldPass : fieldPass - fieldDrop = (fieldDrop.nil?) ? @defaultDsFieldDrop : fieldDrop - urls = (urls.nil?) ? @defaultDsPromUrls : urls - - file_name = "/opt/telegraf-test.conf" - # Copy the telegraf config file to a temp file to run telegraf in test mode with this config - FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf.conf", file_name) - - puts "config::Starting to substitute the placeholders in telegraf conf copy file for daemonset" - #Replace the placeholder config values with values from custom config - text = File.read(file_name) - new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) - new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) - File.open(file_name, "w") { |file| file.puts new_contents } - puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" - - #Set environment variables for telemetry - file = File.open("telemetry_prom_config_env_var", "w") - if !file.nil? - file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") - #Setting array lengths as environment variables for telemetry purposes - file.write("export TELEMETRY_DS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") - file.write("export TELEMETRY_DS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") - file.write("export TELEMETRY_DS_PROM_URLS_LENGTH=#{urls.length}\n") - # Close file after writing all environment variables - file.close - puts "config::Successfully created telemetry file for daemonset" - end - else - ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for daemonset, using defaults, please use right types for all settings") - end # end of type check condition - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults, please check correctness of configmap") - puts "****************End Prometheus Config Processing********************" - end - end # end of controller type check - end - else - ConfigParseErrorLogger.logError("Controller undefined while processing prometheus config, using defaults") - end -end - -@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] -puts "****************Start Prometheus Config Processing********************" -if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = parseConfigMap - if !configMapSettings.nil? - populateSettingValuesFromConfigMap(configMapSettings) - end -else - if (File.file?(@promConfigMapMountPath)) - ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported version") - else - puts "config::No configmap mounted for prometheus custom config, using defaults" - end -end -puts "****************End Prometheus Config Processing********************" diff --git a/build/version b/build/version index 2da3efa39..83a0a174b 100644 --- a/build/version +++ b/build/version @@ -2,11 +2,11 @@ # Build Version Information -CONTAINER_BUILDVERSION_MAJOR=13 +CONTAINER_BUILDVERSION_MAJOR=14 CONTAINER_BUILDVERSION_MINOR=0 CONTAINER_BUILDVERSION_PATCH=0 CONTAINER_BUILDVERSION_BUILDNR=0 -CONTAINER_BUILDVERSION_DATE=20210223 +CONTAINER_BUILDVERSION_DATE=20210326 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/build/windows/installer/conf/fluent-bit.conf b/build/windows/installer/conf/fluent-bit.conf index 879ee4810..1eebe5fd6 100644 --- a/build/windows/installer/conf/fluent-bit.conf +++ b/build/windows/installer/conf/fluent-bit.conf @@ -12,6 +12,15 @@ Chunk_Size 32 Buffer_Size 64 +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25229 + Chunk_Size 32 + Buffer_Size 64 + Mem_Buf_Limit 5m + [OUTPUT] Name oms EnableTelemetry true diff --git a/build/windows/installer/conf/telegraf.conf b/build/windows/installer/conf/telegraf.conf new file mode 100644 index 000000000..5f4d2364e --- /dev/null +++ b/build/windows/installer/conf/telegraf.conf @@ -0,0 +1,162 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + hostName = "placeholder_hostname" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "15s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types. +[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25229" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + namedrop = ["agent_telemetry"] + #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +[[processors.converter]] + [processors.converter.fields] + float = ["*"] + +#Prometheus Custom Metrics +[[inputs.prometheus]] + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP + + metric_version = 2 + url_tag = "scrapeUrl" + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml index ce64fd1ce..9c8014ed0 100644 --- a/charts/azuremonitor-containers/Chart.yaml +++ b/charts/azuremonitor-containers/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v1 appVersion: 7.0.0-1 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes name: azuremonitor-containers -version: 2.8.1 +version: 2.8.2 kubeVersion: "^1.10.0-0" keywords: - monitoring diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index 82d210f3d..8868b86bb 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -81,6 +81,12 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: PODNAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SIDECAR_SCRAPING_ENABLED + value: "false" volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml index 615cd0485..7201ee6ae 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml @@ -131,6 +131,7 @@ spec: - "/opt/livenessprobe.sh" initialDelaySeconds: 60 periodSeconds: 60 + timeoutSeconds: 15 {{- with .Values.omsagent.daemonset.affinity }} affinity: {{- toYaml . | nindent 8 }} {{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index 012dd2720..fdc520cba 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -72,7 +72,9 @@ spec: value: {{ .Values.Azure.Extension.Name | quote }} {{- end }} - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" + value: "" + - name: SIDECAR_SCRAPING_ENABLED + value: "false" - name: ISTEST value: {{ .Values.omsagent.ISTEST | quote }} securityContext: @@ -109,6 +111,9 @@ spec: - mountPath: /etc/config/settings/adx name: omsagent-adx-secret readOnly: true + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config + readOnly: true livenessProbe: exec: command: @@ -117,6 +122,7 @@ spec: - "/opt/livenessprobe.sh" initialDelaySeconds: 60 periodSeconds: 60 + timeoutSeconds: 15 {{- with .Values.omsagent.deployment.affinity }} affinity: {{- toYaml . | nindent 8 }} {{- end }} @@ -158,4 +164,8 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true {{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml index 5db5c2dab..c0a6e3722 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml @@ -28,7 +28,7 @@ rules: resources: ["healthstates"] verbs: ["get", "create", "patch"] - apiGroups: ["clusterconfig.azure.com"] - resources: ["azureclusteridentityrequests"] + resources: ["azureclusteridentityrequests", "azureclusteridentityrequests/status"] resourceNames: ["container-insights-clusteridentityrequest"] verbs: ["get", "create", "patch"] - nonResourceURLs: ["/metrics"] diff --git a/charts/azuremonitor-containers/templates/omsagent-secret.yaml b/charts/azuremonitor-containers/templates/omsagent-secret.yaml index 1a7f087ed..8c245338c 100644 --- a/charts/azuremonitor-containers/templates/omsagent-secret.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-secret.yaml @@ -13,7 +13,19 @@ data: WSID: {{ required "A valid workspace id is required!" .Values.omsagent.secret.wsid | b64enc | quote }} KEY: {{ required "A valid workspace key is required!" .Values.omsagent.secret.key | b64enc | quote }} DOMAIN: {{ .Values.omsagent.domain | b64enc | quote }} - {{- if ne .Values.omsagent.proxy "" }} + {{- $httpsProxyDict := urlParse .Values.Azure.proxySettings.httpsProxy -}} + {{- $httpProxyDict := urlParse .Values.Azure.proxySettings.httpProxy -}} + {{- if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) ($httpsProxyDict.userinfo) }} + PROXY: {{ .Values.Azure.proxySettings.httpsProxy | b64enc | quote }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) (empty $httpsProxyDict.userinfo) }} + # adding arbitrary creds since omsagent expects arbitrary creds in case of no auth + PROXY: {{ urlJoin (dict "scheme" $httpsProxyDict.scheme "userinfo" "admin:secret" "host" $httpsProxyDict.host) | b64enc | quote }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) ($httpProxyDict.userinfo) }} + PROXY: {{ .Values.Azure.proxySettings.httpProxy | b64enc | quote }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) (empty $httpProxyDict.userinfo) }} + # adding arbitrary creds since omsagent expects arbitrary creds in case of no auth + PROXY: {{ urlJoin (dict "scheme" $httpProxyDict.scheme "userinfo" "admin:secret" "host" $httpProxyDict.host) | b64enc | quote }} + {{- else if ne .Values.omsagent.proxy "" }} PROXY: {{ .Values.omsagent.proxy | b64enc | quote }} {{- end }} {{- end }} diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index afe789d56..fc2a3afaf 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -12,13 +12,19 @@ Azure: Extension: Name: "" ResourceId: "" + proxySettings: + isProxyEnabled: false + httpProxy: "" + httpsProxy: "" + noProxy: "" + proxyCert: "" omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod02232021" - tagWindows: "win-ciprod02232021" + tag: "ciprod03262021" + tagWindows: "win-ciprod03262021" pullPolicy: IfNotPresent - dockerProviderVersion: "13.0.0-0" + dockerProviderVersion: "14.0.0-0" agentVersion: "1.10.0.1" # The priority used by the omsagent priority class for the daemonset pods diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index aec1bb456..e38d9b4ab 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -76,6 +76,17 @@ data: ## ex: monitor_kubernetes_pods_namespaces = ["default1", "default2", "default3"] # monitor_kubernetes_pods_namespaces = ["default1"] + ## Label selector to target pods which have the specified label + ## This will take effect when monitor_kubernetes_pods is set to true + ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + # kubernetes_label_selector = "env=dev,app=nginx" + + ## Field selector to target pods which have the specified field + ## This will take effect when monitor_kubernetes_pods is set to true + ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/ + ## eg. To scrape pods on a specific node + # kubernetes_field_selector = "spec.nodeName=$HOSTNAME" + [prometheus_data_collection_settings.node] # Node level scrape endpoint(s). These metrics will be scraped from agent's DaemonSet running in every node in the cluster # Any errors related to prometheus scraping can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to. diff --git a/kubernetes/container-azm-ms-osmconfig.yaml b/kubernetes/container-azm-ms-osmconfig.yaml new file mode 100644 index 000000000..05b7ac3ed --- /dev/null +++ b/kubernetes/container-azm-ms-osmconfig.yaml @@ -0,0 +1,17 @@ +kind: ConfigMap +apiVersion: v1 +data: + schema-version: + #string.used by agent to parse OSM config. supported versions are {v1}. Configs with other schema versions will be rejected by the agent. + v1 + config-version: + #string.used by OSM addon team to keep track of this config file's version in their source control/repository (max allowed 10 chars, other chars will be truncated) + ver1 + osm-metric-collection-configuration: |- + # OSM metric collection settings + [osm_metric_collection_configuration.settings] + # Namespaces to monitor + # monitor_namespaces = ["namespace1", "namespace2"] +metadata: + name: container-azm-ms-osmconfig + namespace: kube-system diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index bee718a31..76b8622b4 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod02232021 +ARG IMAGE_TAG=ciprod03262021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi @@ -17,7 +17,7 @@ ENV KUBE_CLIENT_BACKOFF_BASE 1 ENV KUBE_CLIENT_BACKOFF_DURATION 0 ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* -COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs mdsd.xml envmdsd $tmpdir/ +COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd $tmpdir/ WORKDIR ${tmpdir} # copy docker provider shell bundle to use the agent image diff --git a/kubernetes/linux/defaultpromenvvariables-rs b/kubernetes/linux/defaultpromenvvariables-rs index 1346e62b9..920f4e90e 100644 --- a/kubernetes/linux/defaultpromenvvariables-rs +++ b/kubernetes/linux/defaultpromenvvariables-rs @@ -1,7 +1,12 @@ -export AZMON_RS_PROM_INTERVAL="1m" -export AZMON_RS_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" -export AZMON_RS_PROM_FIELDPASS="[]" -export AZMON_RS_PROM_FIELDDROP="[]" -export AZMON_RS_PROM_URLS="[]" -export AZMON_RS_PROM_K8S_SERVICES="[]" -export AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL="1m" +export AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" +export AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE="pod_scrape_scope = 'cluster'" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_URLS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export AZMON_TELEGRAF_OSM_PROM_PLUGINS="" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR="kubernetes_label_selector = ''" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR="kubernetes_field_selector = ''" + diff --git a/kubernetes/linux/defaultpromenvvariables-sidecar b/kubernetes/linux/defaultpromenvvariables-sidecar new file mode 100644 index 000000000..3301488d8 --- /dev/null +++ b/kubernetes/linux/defaultpromenvvariables-sidecar @@ -0,0 +1,9 @@ +export AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL="1m" +export AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" +export AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE="pod_scrape_scope = 'node'" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export AZMON_TELEGRAF_OSM_PROM_PLUGINS="" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR="kubernetes_label_selector = ''" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR="kubernetes_field_selector = ''" diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c4067f25e..71e46875b 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -2,7 +2,17 @@ if [ -e "/etc/config/kube.conf" ]; then cat /etc/config/kube.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf +elif [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "setting omsagent conf file for prometheus sidecar" + cat /etc/opt/microsoft/docker-cimprov/prometheus-side-car.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf + # omsadmin.sh replaces %MONITOR_AGENT_PORT% and %SYSLOG_PORT% in the monitor.conf and syslog.conf with default ports 25324 and 25224. + # Since we are running 2 omsagents in the same pod, we need to use a different port for the sidecar, + # else we will see the Address already in use - bind(2) for 0.0.0.0:253(2)24 error. + # Look into omsadmin.sh scripts's configure_monitor_agent()/configure_syslog() and find_available_port() methods for more info. + sed -i -e 's/port %MONITOR_AGENT_PORT%/port 25326/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/monitor.conf + sed -i -e 's/port %SYSLOG_PORT%/port 25226/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf else + echo "setting omsagent conf file for daemonset" sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf fi sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf @@ -28,6 +38,12 @@ sudo setfacl -m user:omsagent:rwx /var/opt/microsoft/docker-cimprov/log #Run inotify as a daemon to track changes to the mounted configmap. inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s' +#Run inotify as a daemon to track changes to the mounted configmap for OSM settings. +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + inotifywait /etc/config/osm-settings --daemon --recursive --outfile "/opt/inotifyoutput-osm.txt" --event create,delete --format '%e : %T' --timefmt '+%s' +fi + #resourceid override for loganalytics data. if [ -z $AKS_RESOURCE_ID ]; then echo "not setting customResourceId" @@ -68,6 +84,24 @@ if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/ echo "AZMON_AGENT_CFG_FILE_VERSION:$AZMON_AGENT_CFG_FILE_VERSION" fi +#set OSM config schema version +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then + #trim + osm_config_schema_version="$(cat /etc/config/osm-settings/schema-version | xargs)" + #remove all spaces + osm_config_schema_version="${osm_config_schema_version//[[:space:]]/}" + #take first 10 characters + osm_config_schema_version="$(echo $osm_config_schema_version| cut -c1-10)" + + export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version + echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >> ~/.bashrc + source ~/.bashrc + echo "AZMON_OSM_CFG_SCHEMA_VERSION:$AZMON_OSM_CFG_SCHEMA_VERSION" + fi +fi + export PROXY_ENDPOINT="" # Check for internet connectivity or workspace deletion @@ -193,71 +227,58 @@ echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc source ~/.bashrc +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + #Parse the configmap to set the right environment variables. + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb -#Parse the configmap to set the right environment variables. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb - -cat config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source config_env_var - + cat config_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_env_var +fi #Parse the configmap to set the right environment variables for agent config. #Note > tomlparser-agent-config.rb has to be parsed first before td-agent-bit-conf-customizer.rb for fbit agent settings -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb -cat agent_config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source agent_config_env_var + cat agent_config_env_var | while read line; do + #echo $line + echo $line >> ~/.bashrc + done + source agent_config_env_var -#Parse the configmap to set the right environment variables for network policy manager (npm) integration. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb + #Parse the configmap to set the right environment variables for network policy manager (npm) integration. + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb -cat integration_npm_config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source integration_npm_config_env_var + cat integration_npm_config_env_var | while read line; do + #echo $line + echo $line >> ~/.bashrc + done + source integration_npm_config_env_var +fi #Replace the placeholders in td-agent-bit.conf file for fluentbit with custom/default values in daemonset -if [ ! -e "/etc/config/kube.conf" ]; then +if [ ! -e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /opt/microsoft/omsagent/ruby/bin/ruby td-agent-bit-conf-customizer.rb fi #Parse the prometheus configmap to create a file with new custom settings. /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-prom-customconfig.rb -#If config parsing was successful, a copy of the conf file with replaced custom settings file is created -if [ ! -e "/etc/config/kube.conf" ]; then - if [ -e "/opt/telegraf-test.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test.conf -test - if [ $? -eq 0 ]; then - mv "/opt/telegraf-test.conf" "/etc/opt/microsoft/docker-cimprov/telegraf.conf" - fi - echo "****************End Telegraf Run in Test Mode**************************" - fi -else - if [ -e "/opt/telegraf-test-rs.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test-rs.conf -test - if [ $? -eq 0 ]; then - mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" - fi - echo "****************End Telegraf Run in Test Mode**************************" - fi -fi - #Setting default environment variables to be used in any case of failure in the above steps if [ ! -e "/etc/config/kube.conf" ]; then - cat defaultpromenvvariables | while read line; do - echo $line >> ~/.bashrc - done - source defaultpromenvvariables + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + cat defaultpromenvvariables-sidecar | while read line; do + echo $line >> ~/.bashrc + done + source defaultpromenvvariables-sidecar + else + cat defaultpromenvvariables | while read line; do + echo $line >> ~/.bashrc + done + source defaultpromenvvariables + fi else cat defaultpromenvvariables-rs | while read line; do echo $line >> ~/.bashrc @@ -273,21 +294,37 @@ if [ -e "telemetry_prom_config_env_var" ]; then source telemetry_prom_config_env_var fi + #Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-mdm-metrics-config.rb +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-mdm-metrics-config.rb -cat config_mdm_metrics_env_var | while read line; do - echo $line >> ~/.bashrc -done -source config_mdm_metrics_env_var + cat config_mdm_metrics_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_mdm_metrics_env_var -#Parse the configmap to set the right environment variables for metric collection settings -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb + #Parse the configmap to set the right environment variables for metric collection settings + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb -cat config_metric_collection_env_var | while read line; do - echo $line >> ~/.bashrc -done -source config_metric_collection_env_var + cat config_metric_collection_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_metric_collection_env_var +fi + +# OSM scraping to be done in replicaset if sidecar car scraping is disabled and always do the scraping from the sidecar (It will always be either one of the two) +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-osm-config.rb + + if [ -e "integration_osm_config_env_var" ]; then + cat integration_osm_config_env_var | while read line; do + echo $line >> ~/.bashrc + done + source integration_osm_config_env_var + fi +fi #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" @@ -511,7 +548,7 @@ fi #start oneagent -if [ ! -e "/etc/config/kube.conf" ]; then +if [ ! -e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then if [ ! -z $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE ]; then echo "container logs configmap route is $AZMON_CONTAINER_LOGS_ROUTE" echo "container logs effective route is $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" @@ -552,18 +589,56 @@ if [ ! -e "/etc/config/kube.conf" ]; then fi echo "************end oneagent log routing checks************" +#If config parsing was successful, a copy of the conf file with replaced custom settings file is created +if [ ! -e "/etc/config/kube.conf" ]; then + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ] && [ -e "/opt/telegraf-test-prom-side-car.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-prom-side-car.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-prom-side-car.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + else + if [ -e "/opt/telegraf-test.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test.conf" "/etc/opt/microsoft/docker-cimprov/telegraf.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + fi + fi +else + if [ -e "/opt/telegraf-test-rs.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-rs.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + fi +fi + #telegraf & fluentbit requirements if [ ! -e "/etc/config/kube.conf" ]; then - if [ "$CONTAINER_RUNTIME" == "docker" ]; then - /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & - telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "starting fluent-bit and setting telegraf conf file for prometheus sidecar" + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" else - echo "since container run time is $CONTAINER_RUNTIME update the container log fluentbit Parser to cri from docker" - sed -i 's/Parser.docker*/Parser cri/' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf - /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & - telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + echo "starting fluent-bit and setting telegraf conf file for daemonset" + if [ "$CONTAINER_RUNTIME" == "docker" ]; then + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + else + echo "since container run time is $CONTAINER_RUNTIME update the container log fluentbit Parser to cri from docker" + sed -i 's/Parser.docker*/Parser cri/' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + fi fi else + echo "starting fluent-bit and setting telegraf conf file for replicaset" /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf -e /opt/td-agent-bit/bin/out_oms.so & telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" fi diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index fe6c0565a..218e3c717 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -60,7 +60,13 @@ sudo apt-get install libcap2-bin -y #service telegraf stop -wget https://github.com/microsoft/Docker-Provider/releases/download/5.0.0.0/telegraf +#wget https://github.com/microsoft/Docker-Provider/releases/download/5.0.0.0/telegraf + +#1.18 pre-release +wget https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_linux_amd64.tar.gz +tar -zxvf telegraf-1.18.0_linux_amd64.tar.gz + +mv /opt/telegraf-1.18.0/usr/bin/telegraf /opt/telegraf chmod 777 /opt/telegraf diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index ebf0257af..206d9a8f0 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -358,7 +358,7 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "13.0.0-0" + dockerProviderVersion: "14.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" imagePullPolicy: IfNotPresent resources: limits: @@ -443,6 +443,61 @@ spec: - /opt/livenessprobe.sh initialDelaySeconds: 60 periodSeconds: 60 + timeoutSeconds: 15 +#Only in sidecar scraping mode + - name: omsagent-prometheus + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 500m + memory: 400Mi + requests: + cpu: 75m + memory: 225Mi + env: + # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID_VALUE" + - name: AKS_REGION + value: "VALUE_AKS_RESOURCE_REGION_VALUE" + #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + #- name: ACS_RESOURCE_NAME + # value: "my_acs_cluster_name" + - name: CONTAINER_TYPE + value: "PrometheusSidecar" + - name: CONTROLLER_TYPE + value: "DaemonSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + # Update this with the user assigned msi client id for omsagent + - name: USER_ASSIGNED_IDENTITY_CLIENT_ID + value: "" + securityContext: + privileged: true + volumeMounts: + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config + readOnly: true + livenessProbe: + exec: + command: + - /bin/bash + - -c + - /opt/livenessprobe.sh + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 15 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -502,6 +557,10 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true --- apiVersion: apps/v1 kind: Deployment @@ -524,13 +583,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "13.0.0-0" + dockerProviderVersion: "14.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" imagePullPolicy: IfNotPresent resources: limits: @@ -559,6 +618,9 @@ spec: # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID value: "" + # Add the below environment variable to true only in sidecar enabled regions, else set it to false + - name: SIDECAR_SCRAPING_ENABLED + value: "true" securityContext: privileged: true ports: @@ -586,6 +648,8 @@ spec: readOnly: true - mountPath: /etc/config/settings/adx name: omsagent-adx-secret + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config readOnly: true livenessProbe: exec: @@ -595,6 +659,7 @@ spec: - /opt/livenessprobe.sh initialDelaySeconds: 60 periodSeconds: 60 + timeoutSeconds: 15 affinity: nodeAffinity: # affinity to schedule on to ephemeral os node if its available @@ -658,6 +723,10 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true --- apiVersion: apps/v1 kind: DaemonSet @@ -681,7 +750,7 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "13.0.0-0" + dockerProviderVersion: "14.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -691,7 +760,7 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod02232021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021" imagePullPolicy: IfNotPresent resources: limits: @@ -711,10 +780,16 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: PODNAME + valueFrom: + fieldRef: + fieldPath: metadata.name - name: NODE_IP valueFrom: fieldRef: fieldPath: status.hostIP + - name: SIDECAR_SCRAPING_ENABLED + value: "true" volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index d4f118449..e4ace417a 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod02232021 +ARG IMAGE_TAG=win-ciprod03262021 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement @@ -47,6 +47,7 @@ RUN ./setup.ps1 COPY main.ps1 /opt/omsagentwindows/scripts/powershell COPY ./omsagentwindows/installer/scripts/filesystemwatcher.ps1 /opt/omsagentwindows/scripts/powershell COPY ./omsagentwindows/installer/scripts/livenessprobe.cmd /opt/omsagentwindows/scripts/cmd/ +COPY setdefaulttelegrafenvvariables.ps1 /opt/omsagentwindows/scripts/powershell # copy ruby scripts to /opt folder COPY ./omsagentwindows/installer/scripts/*.rb /opt/omsagentwindows/scripts/ruby/ @@ -62,6 +63,9 @@ COPY ./omsagentwindows/installer/conf/fluent-docker-parser.conf /etc/fluent/ COPY ./omsagentwindows/installer/conf/fluent-bit.conf /etc/fluent-bit COPY ./omsagentwindows/installer/conf/out_oms.conf /etc/omsagentwindows +# copy telegraf conf file +COPY ./omsagentwindows/installer/conf/telegraf.conf /etc/telegraf/ + # copy keepcert alive ruby scripts COPY ./omsagentwindows/installer/scripts/rubyKeepCertificateAlive/*.rb /etc/fluent/plugin/ diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 722392157..95cba2579 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -273,9 +273,9 @@ function Get-ContainerRuntime { return $containerRuntime } -function Start-Fluent { +function Start-Fluent-Telegraf { - # Run fluent-bit service first so that we do not miss any logs being forwarded by the fluentd service. + # Run fluent-bit service first so that we do not miss any logs being forwarded by the fluentd service and telegraf service. # Run fluent-bit as a background job. Switch this to a windows service once fluent-bit supports natively running as a windows service Start-Job -ScriptBlock { Start-Process -NoNewWindow -FilePath "C:\opt\fluent-bit\bin\fluent-bit.exe" -ArgumentList @("-c", "C:\etc\fluent-bit\fluent-bit.conf", "-e", "C:\opt\omsagentwindows\out_oms.so") } @@ -289,35 +289,99 @@ function Start-Fluent { (Get-Content -Path C:/etc/fluent/fluent.conf -Raw) -replace 'fluent-docker-parser.conf','fluent-cri-parser.conf' | Set-Content C:/etc/fluent/fluent.conf } + # Start telegraf only in sidecar scraping mode + $sidecarScrapingEnabled = [System.Environment]::GetEnvironmentVariable('SIDECAR_SCRAPING_ENABLED') + if (![string]::IsNullOrEmpty($sidecarScrapingEnabled) -and $sidecarScrapingEnabled.ToLower() -eq 'true') + { + Write-Host "Starting telegraf..." + Start-Telegraf + } + fluentd --reg-winsvc i --reg-winsvc-auto-start --winsvc-name fluentdwinaks --reg-winsvc-fluentdopt '-c C:/etc/fluent/fluent.conf -o C:/etc/fluent/fluent.log' Notepad.exe | Out-Null } -function Generate-Certificates { - Write-Host "Generating Certificates" - C:\\opt\\omsagentwindows\\certgenerator\\certificategenerator.exe -} +function Start-Telegraf { + # Set default telegraf environment variables for prometheus scraping + Write-Host "**********Setting default environment variables for telegraf prometheus plugin..." + .\setdefaulttelegrafenvvariables.ps1 + + # run prometheus custom config parser + Write-Host "**********Running config parser for custom prometheus scraping**********" + ruby /opt/omsagentwindows/scripts/ruby/tomlparser-prom-customconfig.rb + Write-Host "**********End running config parser for custom prometheus scraping**********" + + + # Set required environment variable for telegraf prometheus plugin to run properly + Write-Host "Setting required environment variables for telegraf prometheus input plugin to run properly..." + $kubernetesServiceHost = [System.Environment]::GetEnvironmentVariable("KUBERNETES_SERVICE_HOST", "process") + if (![string]::IsNullOrEmpty($kubernetesServiceHost)) { + [System.Environment]::SetEnvironmentVariable("KUBERNETES_SERVICE_HOST", $kubernetesServiceHost, "machine") + Write-Host "Successfully set environment variable KUBERNETES_SERVICE_HOST - $($kubernetesServiceHost) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable KUBERNETES_SERVICE_HOST for target 'machine' since it is either null or empty" + } + + $kubernetesServicePort = [System.Environment]::GetEnvironmentVariable("KUBERNETES_SERVICE_PORT", "process") + if (![string]::IsNullOrEmpty($kubernetesServicePort)) { + [System.Environment]::SetEnvironmentVariable("KUBERNETES_SERVICE_PORT", $kubernetesServicePort, "machine") + Write-Host "Successfully set environment variable KUBERNETES_SERVICE_PORT - $($kubernetesServicePort) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable KUBERNETES_SERVICE_PORT for target 'machine' since it is either null or empty" + } + + $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") + if (![string]::IsNullOrEmpty($nodeIp)) { + [System.Environment]::SetEnvironmentVariable("NODE_IP", $nodeIp, "machine") + Write-Host "Successfully set environment variable NODE_IP - $($nodeIp) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty" + } -function Bootstrap-CACertificates { + Write-Host "Installing telegraf service" + C:\opt\telegraf\telegraf.exe --service install --config "C:\etc\telegraf\telegraf.conf" + + # Setting delay auto start for telegraf since there have been known issues with windows server and telegraf - + # https://github.com/influxdata/telegraf/issues/4081 + # https://github.com/influxdata/telegraf/issues/3601 try { - # This is required when the root CA certs are different for some clouds. - $caCerts=Invoke-WebRequest 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing | ConvertFrom-Json - if (![string]::IsNullOrEmpty($caCerts)) { - $certificates = $caCerts.Certificates - for ($index = 0; $index -lt $certificates.Length ; $index++) { - $name=$certificates[$index].Name - $certificates[$index].CertBody > $name - Write-Host "name: $($name)" - Import-Certificate -FilePath .\$name -CertStoreLocation 'Cert:\LocalMachine\Root' -Verbose - } + $serverName = [System.Environment]::GetEnvironmentVariable("PODNAME", "process") + if (![string]::IsNullOrEmpty($serverName)) { + sc.exe \\$serverName config telegraf start= delayed-auto + Write-Host "Successfully set delayed start for telegraf" + + } else { + Write-Host "Failed to get environment variable PODNAME to set delayed telegraf start" } } catch { - $e = $_.Exception - Write-Host $e - Write-Host "exception occured in Bootstrap-CACertificates..." + $e = $_.Exception + Write-Host $e + Write-Host "exception occured in delayed telegraf start.. continuing without exiting" } + Write-Host "Running telegraf service in test mode" + C:\opt\telegraf\telegraf.exe --config "C:\etc\telegraf\telegraf.conf" --test + Write-Host "Starting telegraf service" + C:\opt\telegraf\telegraf.exe --service start + + # Trying to start telegraf again if it did not start due to fluent bit not being ready at startup + Get-Service telegraf | findstr Running + if ($? -eq $false) + { + Write-Host "trying to start telegraf in again in 30 seconds, since fluentbit might not have been ready..." + Start-Sleep -s 30 + C:\opt\telegraf\telegraf.exe --service start + Get-Service telegraf + } +} + +function Generate-Certificates { + Write-Host "Generating Certificates" + C:\\opt\\omsagentwindows\\certgenerator\\certificategenerator.exe } function Test-CertificatePath { @@ -346,16 +410,9 @@ Remove-WindowsServiceIfItExists "fluentdwinaks" Set-EnvironmentVariables Start-FileSystemWatcher -#Bootstrapping CA certs for non public clouds and AKS clusters -$aksResourceId = [System.Environment]::GetEnvironmentVariable("AKS_RESOURCE_ID") -if (![string]::IsNullOrEmpty($aksResourceId) -and $aksResourceId.ToLower().Contains("/microsoft.containerservice/managedclusters/")) -{ - Bootstrap-CACertificates -} - Generate-Certificates Test-CertificatePath -Start-Fluent +Start-Fluent-Telegraf # List all powershell processes running. This should have main.ps1 and filesystemwatcher.ps1 Get-WmiObject Win32_process | Where-Object { $_.Name -match 'powershell' } | Format-Table -Property Name, CommandLine, ProcessId diff --git a/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 b/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 new file mode 100644 index 000000000..269894139 --- /dev/null +++ b/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 @@ -0,0 +1,17 @@ +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", "1m", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", "1m", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "monitor_kubernetes_pods = false", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "monitor_kubernetes_pods = false", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "pod_scrape_scope = 'node'", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "pod_scrape_scope = 'node'", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", "[]", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", "[]", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", "[]", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", "[]", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", " ", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", " ", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "kubernetes_label_selector = ''", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "kubernetes_label_selector = ''", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "kubernetes_field_selector = ''", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "kubernetes_field_selector = ''", "machine") + diff --git a/kubernetes/windows/setup.ps1 b/kubernetes/windows/setup.ps1 index dd6d52a11..25aad5e16 100644 --- a/kubernetes/windows/setup.ps1 +++ b/kubernetes/windows/setup.ps1 @@ -8,10 +8,12 @@ Write-Host ('Creating folder structure') New-Item -Type Directory -Path /opt/fluent-bit New-Item -Type Directory -Path /opt/scripts/ruby + New-Item -Type Directory -Path /opt/telegraf New-Item -Type Directory -Path /etc/fluent-bit New-Item -Type Directory -Path /etc/fluent New-Item -Type Directory -Path /etc/omsagentwindows + New-Item -Type Directory -Path /etc/telegraf New-Item -Type Directory -Path /etc/config/settings/ New-Item -Type Directory -Path /etc/config/adx/ @@ -32,6 +34,20 @@ Write-Host ('Installing Fluent Bit'); } Write-Host ('Finished Installing Fluentbit') +Write-Host ('Installing Telegraf'); +try { + $telegrafUri='https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_windows_amd64.zip' + Invoke-WebRequest -Uri $telegrafUri -OutFile /installation/telegraf.zip + Expand-Archive -Path /installation/telegraf.zip -Destination /installation/telegraf + Move-Item -Path /installation/telegraf/*/* -Destination /opt/telegraf/ -ErrorAction SilentlyContinue +} +catch { + $ex = $_.Exception + Write-Host "exception while downloading telegraf for windows" + Write-Host $ex + exit 1 +} +Write-Host ('Finished downloading Telegraf') Write-Host ('Installing Visual C++ Redistributable Package') $vcRedistLocation = 'https://aka.ms/vs/16/release/vc_redist.x64.exe' diff --git a/scripts/build/windows/install-build-pre-requisites.ps1 b/scripts/build/windows/install-build-pre-requisites.ps1 index b5e6e2d18..3bb56ac2a 100755 --- a/scripts/build/windows/install-build-pre-requisites.ps1 +++ b/scripts/build/windows/install-build-pre-requisites.ps1 @@ -21,7 +21,7 @@ function Install-Go { # install go lang Write-Host("installing go ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing go completed") Write-Host "updating PATH variable" @@ -102,7 +102,7 @@ function Install-DotNetCoreSDK() { # install dotNet core sdk Write-Host("installing .net core sdk 3.1 ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing .net core sdk 3.1 completed") } @@ -129,7 +129,7 @@ function Install-Docker() { # install docker Write-Host("installing docker for desktop ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing docker for desktop completed") } diff --git a/scripts/onboarding/managed/disable-monitoring.sh b/scripts/onboarding/managed/disable-monitoring.sh index d43a79f51..29b755331 100644 --- a/scripts/onboarding/managed/disable-monitoring.sh +++ b/scripts/onboarding/managed/disable-monitoring.sh @@ -127,7 +127,7 @@ remove_monitoring_tags() # validate cluster identity for Azure Arc enabled Kubernetes cluster if [ "$isArcK8sCluster" = true ] ; then - identitytype=$(az resource show -g ${clusterResourceGroup} -n ${clusterName} --resource-type $resourceProvider --query identity.type) + identitytype=$(az resource show -g ${clusterResourceGroup} -n ${clusterName} --resource-type $resourceProvider --query identity.type -o json) identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"') echo "cluster identity type:" $identitytype if [[ "$identitytype" != "systemassigned" ]]; then diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index d8ab7b345..ec57bf981 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -64,7 +64,7 @@ $isUsingServicePrincipal = $false # released chart version in mcr $mcr = "mcr.microsoft.com" -$mcrChartVersion = "2.8.1" +$mcrChartVersion = "2.8.2" $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" $helmLocalRepoName = "." $omsAgentDomainName="opinsights.azure.com" diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 9d0c0aca5..9747d932d 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -44,7 +44,7 @@ defaultAzureCloud="AzureCloud" omsAgentDomainName="opinsights.azure.com" # released chart version in mcr -mcrChartVersion="2.8.1" +mcrChartVersion="2.8.2" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" helmLocalRepoName="." @@ -311,7 +311,7 @@ parse_args() { validate_and_configure_supported_cloud() { echo "get active azure cloud name configured to azure cli" - azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]") + azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") echo "active azure cloud name configured to azure cli: ${azureCloudName}" if [ "$isArcK8sCluster" = true ]; then if [ "$azureCloudName" != "azurecloud" -a "$azureCloudName" != "azureusgovernment" ]; then @@ -339,8 +339,8 @@ validate_cluster_identity() { local rgName="$(echo ${1})" local clusterName="$(echo ${2})" - local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type) - identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"') + local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type -o json) + identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"' | tr -d "[:space:]") echo "cluster identity type:" $identitytype if [[ "$identitytype" != "systemassigned" ]]; then @@ -454,7 +454,7 @@ create_default_log_analytics_workspace() { echo "using existing default workspace:"$workspaceName fi - workspaceResourceId=$(az resource show -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --query id) + workspaceResourceId=$(az resource show -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --query id -o json) workspaceResourceId=$(echo $workspaceResourceId | tr -d '"') echo "workspace resource Id: ${workspaceResourceId}" } @@ -477,12 +477,12 @@ get_workspace_guid_and_key() { local wsName="$(echo ${resourceId} | cut -d'/' -f9)" # get the workspace guid - workspaceGuid=$(az resource show -g $rgName -n $wsName --resource-type $workspaceResourceProvider --query properties.customerId) + workspaceGuid=$(az resource show -g $rgName -n $wsName --resource-type $workspaceResourceProvider --query properties.customerId -o json) workspaceGuid=$(echo $workspaceGuid | tr -d '"') echo "workspaceGuid:"$workspaceGuid echo "getting workspace primaryshared key" - workspaceKey=$(az rest --method post --uri $workspaceResourceId/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey) + workspaceKey=$(az rest --method post --uri $workspaceResourceId/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey -o json) workspaceKey=$(echo $workspaceKey | tr -d '"') } @@ -621,7 +621,7 @@ else set_azure_subscription $workspaceSubscriptionId fi - workspaceRegion=$(az resource show --ids ${workspaceResourceId} --query location) + workspaceRegion=$(az resource show --ids ${workspaceResourceId} --query location -o json) workspaceRegion=$(echo $workspaceRegion | tr -d '"') echo "Workspace Region:"$workspaceRegion fi diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index 6d14dfa5f..1cf7b5c97 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -20,7 +20,7 @@ set -e set -o pipefail # released chart version for Azure Arc enabled Kubernetes public preview -mcrChartVersion="2.8.1" +mcrChartVersion="2.8.2" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" @@ -202,7 +202,7 @@ validate_cluster_identity() { local rgName="$(echo ${1})" local clusterName="$(echo ${2})" - local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type) + local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type -o json) identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"') echo "cluster identity type:" $identitytype @@ -216,7 +216,7 @@ validate_cluster_identity() { validate_monitoring_tags() { echo "get loganalyticsworkspaceResourceId tag on to cluster resource" - logAnalyticsWorkspaceResourceIdTag=$(az resource show --query tags.logAnalyticsWorkspaceResourceId -g $clusterResourceGroup -n $clusterName --resource-type $resourceProvider) + logAnalyticsWorkspaceResourceIdTag=$(az resource show --query tags.logAnalyticsWorkspaceResourceId -g $clusterResourceGroup -n $clusterName --resource-type $resourceProvider -o json) echo "configured log analytics workspace: ${logAnalyticsWorkspaceResourceIdTag}" echo "successfully got logAnalyticsWorkspaceResourceId tag on the cluster resource" if [ -z "$logAnalyticsWorkspaceResourceIdTag" ]; then diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json index 8ebef232a..0069307b7 100644 --- a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json +++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json @@ -13,7 +13,7 @@ "metadata": { "description": "Location of the Azure Arc Connected Cluster Resource e.g. \"eastus\"" } - }, + }, "proxyEndpointUrl": { "type": "string", "defaultValue": "", @@ -114,8 +114,7 @@ }, "configurationProtectedSettings": { "omsagent.secret.wsid": "[reference(parameters('workspaceResourceId'), '2015-03-20').customerId]", - "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]" , - "omsagent.proxy": "[if(equals(parameters('proxyEndpointUrl'), ''), '', parameters('proxyEndpointUrl'))]" + "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]" }, "autoUpgradeMinorVersion": true, "releaseTrain": "Stable", diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index bda757eb8..0be806838 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -1494,4 +1494,4 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Running in replicaset. Disabling container enrichment caching & updates \n") } -} +} \ No newline at end of file diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 3d30ac5aa..48f82a9ab 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -10,9 +10,9 @@ import ( "strings" "time" + "github.com/fluent/fluent-bit-go/output" "github.com/microsoft/ApplicationInsights-Go/appinsights" "github.com/microsoft/ApplicationInsights-Go/appinsights/contracts" - "github.com/fluent/fluent-bit-go/output" ) var ( @@ -44,33 +44,45 @@ var ( ContainerLogsMDSDClientCreateErrors float64 //Tracks the number of write/send errors to ADX for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsSendErrorsToADXFromFluent float64 - //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) + //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsADXClientCreateErrors float64 + //Tracks the number of OSM namespaces and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + OSMNamespaceCount int + //Tracks whether monitor kubernetes pods is set to true and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPods string + //Tracks the number of monitor kubernetes pods namespaces and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsNamespaceLength int + //Tracks the number of monitor kubernetes pods label selectors and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsLabelSelectorLength int + //Tracks the number of monitor kubernetes pods field selectors and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsFieldSelectorLength int ) const ( - clusterTypeACS = "ACS" - clusterTypeAKS = "AKS" - envAKSResourceID = "AKS_RESOURCE_ID" - envACSResourceName = "ACS_RESOURCE_NAME" - envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" - envAppInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" - metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" - metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" - metricNameLogSize = "ContainerLogsSize" - metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" - metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" - metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" - metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" - metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" - metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" - metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" - metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" + clusterTypeACS = "ACS" + clusterTypeAKS = "AKS" + envAKSResourceID = "AKS_RESOURCE_ID" + envACSResourceName = "ACS_RESOURCE_NAME" + envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" + envAppInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" + metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" + metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" + metricNameLogSize = "ContainerLogsSize" + metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" + metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" + metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" + metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" + metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" + metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" + metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" + metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" defaultTelemetryPushIntervalSeconds = 300 - eventNameContainerLogInit = "ContainerLogPluginInitialized" - eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" + eventNameContainerLogInit = "ContainerLogPluginInitialized" + eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" + eventNameCustomPrometheusSidecarHeartbeat = "CustomPrometheusSidecarHeartbeatEvent" + eventNameWindowsFluentBitHeartbeat = "WindowsFluentBitHeartbeatEvent" ) // SendContainerLogPluginMetrics is a go-routine that flushes the data periodically (every 5 mins to App Insights) @@ -100,6 +112,11 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { containerLogsMDSDClientCreateErrors := ContainerLogsMDSDClientCreateErrors containerLogsSendErrorsToADXFromFluent := ContainerLogsSendErrorsToADXFromFluent containerLogsADXClientCreateErrors := ContainerLogsADXClientCreateErrors + osmNamespaceCount := OSMNamespaceCount + promMonitorPods := PromMonitorPods + promMonitorPodsNamespaceLength := PromMonitorPodsNamespaceLength + promMonitorPodsLabelSelectorLength := PromMonitorPodsLabelSelectorLength + promMonitorPodsFieldSelectorLength := PromMonitorPodsFieldSelectorLength TelegrafMetricsSentCount = 0.0 TelegrafMetricsSendErrorCount = 0.0 @@ -118,17 +135,39 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { ContainerLogTelemetryMutex.Unlock() if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { - SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) - flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) - TelemetryClient.Track(flushRateMetric) - logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) - logSizeMetric := appinsights.NewMetricTelemetry(metricNameLogSize, logSizeRate) - TelemetryClient.Track(logRateMetric) - Log("Log Size Rate: %f\n", logSizeRate) - TelemetryClient.Track(logSizeMetric) - logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) - logLatencyMetric.Properties["Container"] = logLatencyMsContainer - TelemetryClient.Track(logLatencyMetric) + if strings.Compare(strings.ToLower(os.Getenv("CONTAINER_TYPE")), "prometheussidecar") == 0 { + telemetryDimensions := make(map[string]string) + telemetryDimensions["CustomPromMonitorPods"] = promMonitorPods + if promMonitorPodsNamespaceLength > 0 { + telemetryDimensions["CustomPromMonitorPodsNamespaceLength"] = strconv.Itoa(promMonitorPodsNamespaceLength) + } + if promMonitorPodsLabelSelectorLength > 0 { + telemetryDimensions["CustomPromMonitorPodsLabelSelectorLength"] = strconv.Itoa(promMonitorPodsLabelSelectorLength) + } + if promMonitorPodsFieldSelectorLength > 0 { + telemetryDimensions["CustomPromMonitorPodsFieldSelectorLength"] = strconv.Itoa(promMonitorPodsFieldSelectorLength) + } + if osmNamespaceCount > 0 { + telemetryDimensions["OsmNamespaceCount"] = strconv.Itoa(osmNamespaceCount) + } + + SendEvent(eventNameCustomPrometheusSidecarHeartbeat, telemetryDimensions) + + } else if strings.Compare(strings.ToLower(os.Getenv("OS_TYPE")), "windows") == 0 { + SendEvent(eventNameWindowsFluentBitHeartbeat, make(map[string]string)) + } else { + SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) + flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + TelemetryClient.Track(flushRateMetric) + logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) + logSizeMetric := appinsights.NewMetricTelemetry(metricNameLogSize, logSizeRate) + TelemetryClient.Track(logRateMetric) + Log("Log Size Rate: %f\n", logSizeRate) + TelemetryClient.Track(logSizeMetric) + logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) + logLatencyMetric.Properties["Container"] = logLatencyMsContainer + TelemetryClient.Track(logLatencyMetric) + } } TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofTelegrafMetricsSentSuccessfully, telegrafMetricsSentCount)) if telegrafMetricsSendErrorCount > 0.0 { @@ -255,12 +294,60 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } if isProxyConfigured == true { - CommonProperties["IsProxyConfigured"] = "true" + CommonProperties["IsProxyConfigured"] = "true" } else { - CommonProperties["IsProxyConfigured"] = "false" - } + CommonProperties["IsProxyConfigured"] = "false" + } + + // Adding container type to telemetry + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + if strings.Compare(strings.ToLower(os.Getenv("CONTAINER_TYPE")), "prometheussidecar") == 0 { + CommonProperties["ContainerType"] = "prometheussidecar" + } + } TelemetryClient.Context().CommonProperties = CommonProperties + + // Getting the namespace count, monitor kubernetes pods values and namespace count once at start because it wont change unless the configmap is applied and the container is restarted + + OSMNamespaceCount = 0 + osmNsCount := os.Getenv("TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT") + if osmNsCount != "" { + OSMNamespaceCount, err = strconv.Atoi(osmNsCount) + if err != nil { + Log("OSM namespace count string to int conversion error %s", err.Error()) + } + } + + PromMonitorPods = os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS") + + PromMonitorPodsNamespaceLength = 0 + promMonPodsNamespaceLength := os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH") + if promMonPodsNamespaceLength != "" { + PromMonitorPodsNamespaceLength, err = strconv.Atoi(promMonPodsNamespaceLength) + if err != nil { + Log("Custom prometheus monitor kubernetes pods namespace count string to int conversion error %s", err.Error()) + } + } + + PromMonitorPodsLabelSelectorLength = 0 + promLabelSelectorLength := os.Getenv("TELEMETRY_CUSTOM_PROM_LABEL_SELECTOR_LENGTH") + if promLabelSelectorLength != "" { + PromMonitorPodsLabelSelectorLength, err = strconv.Atoi(promLabelSelectorLength) + if err != nil { + Log("Custom prometheus label selector count string to int conversion error %s", err.Error()) + } + } + + PromMonitorPodsFieldSelectorLength = 0 + promFieldSelectorLength := os.Getenv("TELEMETRY_CUSTOM_PROM_FIELD_SELECTOR_LENGTH") + if promFieldSelectorLength != "" { + PromMonitorPodsFieldSelectorLength, err = strconv.Atoi(promFieldSelectorLength) + if err != nil { + Log("Custom prometheus field selector count string to int conversion error %s", err.Error()) + } + } + return 0, nil } diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index d3a96d37d..8cb6f603e 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -370,7 +370,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] metricItem["Tags"] = metricTags - + metricItems.push(metricItem) end end diff --git a/source/plugins/ruby/arc_k8s_cluster_identity.rb b/source/plugins/ruby/arc_k8s_cluster_identity.rb index 7824f3d4e..552dafb1f 100644 --- a/source/plugins/ruby/arc_k8s_cluster_identity.rb +++ b/source/plugins/ruby/arc_k8s_cluster_identity.rb @@ -26,6 +26,7 @@ def initialize @log.info "initialize start @ #{Time.now.utc.iso8601}" @token_expiry_time = Time.now @cached_access_token = String.new + @isLastTokenRenewalUpdatePending = false @token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" @cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" @kube_api_server_url = KubernetesApiClient.getKubeAPIServerUrl @@ -41,14 +42,20 @@ def initialize def get_cluster_identity_token() begin - # get the cluster msi identity token either if its empty or near expirty. Token is valid 24 hrs. + # get the cluster msi identity token either if its empty or near expiry. Token is valid 24 hrs. if @cached_access_token.to_s.empty? || (Time.now + 60 * 60 > @token_expiry_time) # Refresh token 1 hr from expiration # renew the token if its near expiry if !@cached_access_token.to_s.empty? && (Time.now + 60 * 60 > @token_expiry_time) - @log.info "renewing the token since its near expiry @ #{Time.now.utc.iso8601}" - renew_near_expiry_token - # sleep 60 seconds to get the renewed token available - sleep 60 + if !@isLastTokenRenewalUpdatePending + @log.info "token expiry - @ #{@token_expiry_time}" + @log.info "renewing the token since token has near expiry @ #{Time.now.utc.iso8601}" + renew_near_expiry_token + # sleep 60 seconds to get the renewed token available + sleep 60 + @isLastTokenRenewalUpdatePending = true + else + @log.warn "last token renewal update still pending @ #{Time.now.utc.iso8601}" + end end @log.info "get token reference from crd @ #{Time.now.utc.iso8601}" tokenReference = get_token_reference_from_crd @@ -61,6 +68,7 @@ def get_cluster_identity_token() token = get_token_from_secret(token_secret_name, token_secret_data_name) if !token.nil? @cached_access_token = token + @isLastTokenRenewalUpdatePending = false else @log.warn "got token nil from secret: #{@token_secret_name}" end @@ -123,7 +131,17 @@ def get_token_reference_from_crd() tokenReference["expirationTime"] = status["expirationTime"] tokenReference["secretName"] = status["tokenReference"]["secretName"] tokenReference["dataName"] = status["tokenReference"]["dataName"] - end + elsif get_response.code.to_i == 404 # this might happen if the crd resource deleted by user accidently + @log.info "since crd resource doesnt exist hence creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" + crd_request_body = get_crd_request_body + crd_request_body_json = crd_request_body.to_json + create_request = Net::HTTP::Post.new(crd_request_uri) + create_request["Content-Type"] = "application/json" + create_request["Authorization"] = "Bearer #{@service_account_token}" + create_request.body = crd_request_body_json + create_response = @http_client.request(create_request) + @log.info "Got response of #{create_response.code} for POST #{crd_request_uri} @ #{Time.now.utc.iso8601}" + end rescue => err @log.warn "get_token_reference_from_crd call failed: #{err}" ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" }) @@ -141,20 +159,23 @@ def renew_near_expiry_token() cluster_identity_resource_namespace: @@cluster_identity_resource_namespace, cluster_identity_resource_name: @@cluster_identity_resource_name, } - crd_request_body = get_crd_request_body - crd_request_body_json = crd_request_body.to_json - update_request = Net::HTTP::Patch.new(crd_request_uri) + update_crd_request_body = { 'status': {'expirationTime': ''} } + update_crd_request_body_json = update_crd_request_body.to_json + update_crd_request_uri = crd_request_uri + "/status" + update_request = Net::HTTP::Patch.new(update_crd_request_uri) update_request["Content-Type"] = "application/merge-patch+json" update_request["Authorization"] = "Bearer #{@service_account_token}" - update_request.body = crd_request_body_json + update_request.body = update_crd_request_body_json update_response = @http_client.request(update_request) - @log.info "Got response of #{update_response.code} for PATCH #{crd_request_uri} @ #{Time.now.utc.iso8601}" + @log.info "Got response of #{update_response.code} for PATCH #{update_crd_request_uri} @ #{Time.now.utc.iso8601}" if update_response.code.to_i == 404 @log.info "since crd resource doesnt exist hence creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" create_request = Net::HTTP::Post.new(crd_request_uri) create_request["Content-Type"] = "application/json" create_request["Authorization"] = "Bearer #{@service_account_token}" - create_request.body = crd_request_body_json + create_crd_request_body = get_crd_request_body + create_crd_request_body_json = create_crd_request_body.to_json + create_request.body = create_crd_request_body_json create_response = @http_client.request(create_request) @log.info "Got response of #{create_response.code} for POST #{crd_request_uri} @ #{Time.now.utc.iso8601}" end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index c803c0fa2..c057f7c2c 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -19,7 +19,10 @@ class Kube_nodeInventory_Input < Input @@rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"] @@rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"] @@rsPromMonitorPodsNamespaceLength = ENV["TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH"] + @@rsPromMonitorPodsLabelSelectorLength = ENV["TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH"] + @@rsPromMonitorPodsFieldSelectorLength = ENV["TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH"] @@collectAllKubeEvents = ENV["AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS"] + @@osmNamespaceCount = ENV["TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT"] def initialize super @@ -296,6 +299,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) properties["rsPromUrl"] = @@rsPromUrlCount properties["rsPromMonPods"] = @@rsPromMonitorPods properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength + properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength + properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength + properties["osmNamespaceCount"] = @@osmNamespaceCount end ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) telemetrySent = true