diff --git a/README.md b/README.md index 3564345ee..555234c61 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ The general directory structure is: │ │ ├── acrworkflows/ - acr work flows for the Linux Agent container image │ │ ├── defaultpromenvvariables - default environment variables for Prometheus scraping │ │ ├── defaultpromenvvariables-rs - cluster level default environment variables for Prometheus scraping +│ │ ├── defaultpromenvvariables-sidecar - cluster level default environment variables for Prometheus scraping in sidecar │ ├── windows/ - scripts to build the Docker image for Windows Agent │ │ ├── dockerbuild - script to build the code and docker imag, and publish docker image │ │ ├── acrworkflows/ - acr work flows for the Windows Agent container image diff --git a/build/common/installer/scripts/tomlparser-prom-customconfig.rb b/build/common/installer/scripts/tomlparser-prom-customconfig.rb new file mode 100644 index 000000000..819c1956f --- /dev/null +++ b/build/common/installer/scripts/tomlparser-prom-customconfig.rb @@ -0,0 +1,423 @@ +#!/usr/local/bin/ruby + +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end +# require_relative "tomlrb" +require_relative "ConfigParseErrorLogger" +require "fileutils" + +@promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" +@replicaset = "replicaset" +@daemonset = "daemonset" +@promSideCar = "prometheussidecar" +@windows = "windows" +@configSchemaVersion = "" +@defaultDsInterval = "1m" +@defaultDsPromUrls = [] +@defaultDsFieldPass = [] +@defaultDsFieldDrop = [] +@defaultRsInterval = "1m" +@defaultRsPromUrls = [] +@defaultRsFieldPass = [] +@defaultRsFieldDrop = [] +@defaultRsK8sServices = [] +# @defaultRsMonitorPods = false +@defaultCustomPrometheusInterval = "1m" +@defaultCustomPrometheusFieldPass = [] +@defaultCustomPrometheusFieldDrop = [] +@defaultCustomPrometheusMonitorPods = false +@defaultCustomPrometheusLabelSelectors = "" +@defaultCustomPrometheusFieldSelectors = "" + +#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering +@metricVersion = 2 +@monitorKubernetesPodsVersion = 2 +@urlTag = "scrapeUrl" +@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" +@responseTimeout = "15s" +@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" +@insecureSkipVerify = true + +# Checking to see if this is the daemonset or replicaset to parse config accordingly +@controller = ENV["CONTROLLER_TYPE"] +@containerType = ENV["CONTAINER_TYPE"] +@sidecarScrapingEnabled = ENV["SIDECAR_SCRAPING_ENABLED"] + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@promConfigMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values for prometheus config map" + parsedConfig = Tomlrb.load_file(@promConfigMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted prometheus config map" + return 
parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for prometheus scraping" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for prometheus config: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +def checkForTypeArray(arrayValue, arrayType) + if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) + return true + else + return false + end +end + +def checkForType(variable, varType) + if variable.nil? || variable.kind_of?(varType) + return true + else + return false + end +end + +def replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + begin + puts "config::Starting to substitute the placeholders in telegraf conf copy file with no namespace filters" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", ("monitor_kubernetes_pods = #{monitorKubernetesPods}")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", ("pod_scrape_scope = \"#{(@controller.casecmp(@replicaset) == 0) ? 
"cluster" : "node"}\"")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", "") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", ("kubernetes_label_selector = \"#{kubernetesLabelSelectors}\"")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", ("kubernetes_field_selector = \"#{kubernetesFieldSelectors}\"")) + rescue => errorStr + puts "Exception while replacing default pod monitor settings for custom prometheus scraping: #{errorStr}" + end + return new_contents +end + +def createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + begin + puts "config::Starting to substitute the placeholders in telegraf conf copy file with namespace filters" + + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE") + + pluginConfigsWithNamespaces = "" + monitorKubernetesPodsNamespaces.each do |namespace| + if !namespace.nil? + #Stripping namespaces to remove leading and trailing whitespaces + namespace.strip! 
+ if namespace.length > 0 + pluginConfigsWithNamespaces += "\n[[inputs.prometheus]] + interval = \"#{interval}\" + monitor_kubernetes_pods = true + pod_scrape_scope = \"#{(@controller.casecmp(@replicaset) == 0) ? "cluster" : "node"}\" + monitor_kubernetes_pods_namespace = \"#{namespace}\" + kubernetes_label_selector = \"#{kubernetesLabelSelectors}\" + kubernetes_field_selector = \"#{kubernetesFieldSelectors}\" + fieldpass = #{fieldPassSetting} + fielddrop = #{fieldDropSetting} + metric_version = #{@metricVersion} + url_tag = \"#{@urlTag}\" + bearer_token = \"#{@bearerToken}\" + response_timeout = \"#{@responseTimeout}\" + tls_ca = \"#{@tlsCa}\" + insecure_skip_verify = #{@insecureSkipVerify}\n" + end + end + end + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", pluginConfigsWithNamespaces) + return new_contents + rescue => errorStr + puts "Exception while creating prometheus input plugins to filter namespaces for custom prometheus: #{errorStr}, using defaults" + replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + if !@controller.nil? + if !parsedConfig.nil? && !parsedConfig[:prometheus_data_collection_settings].nil? + if @controller.casecmp(@replicaset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? 
+ #Get prometheus replicaset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:cluster][:urls] + kubernetesServices = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_services] + + # Remove below 4 lines after phased rollout + monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] + monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] + kubernetesLabelSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_label_selector] + kubernetesFieldSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_field_selector] + + # Check for the right datatypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(kubernetesServices, String) && + checkForTypeArray(urls, String) && + # Remove below check after phased rollout + checkForType(kubernetesLabelSelectors, String) && + checkForType(kubernetesFieldSelectors, String) && + (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) # Checking for Boolean type, since 'Boolean' is not defined as a type in ruby + puts "config::Successfully passed typecheck for config settings for replicaset" + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultRsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultRsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? 
@defaultRsFieldDrop : fieldDrop + kubernetesServices = (kubernetesServices.nil?) ? @defaultRsK8sServices : kubernetesServices + urls = (urls.nil?) ? @defaultRsPromUrls : urls + # Remove below lines after phased rollout + monitorKubernetesPods = (monitorKubernetesPods.nil?) ? @defaultRsMonitorPods : monitorKubernetesPods + kubernetesLabelSelectors = (kubernetesLabelSelectors.nil?) ? @defaultCustomPrometheusLabelSelectors : kubernetesLabelSelectors + kubernetesFieldSelectors = (kubernetesFieldSelectors.nil?) ? @defaultCustomPrometheusFieldSelectors : kubernetesFieldSelectors + + file_name = "/opt/telegraf-test-rs.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for replicaset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", interval) + fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", fieldPassSetting) + fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", fieldDropSetting) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES", ((kubernetesServices.length > 0) ? 
("[\"" + kubernetesServices.join("\",\"") + "\"]") : "[]")) + + # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - + # - to use defaults in case of nil settings + # Remove below block after phased rollout + if (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil? && (@sidecarScrapingEnabled.casecmp("false") == 0))) + monitorKubernetesPodsNSConfig = [] + if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? && checkForTypeArray(monitorKubernetesPodsNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (monitorKubernetesPodsNamespaces.length > 0) + new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length + monitorKubernetesPodsNSConfig = monitorKubernetesPodsNamespaces + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + # Label and field selectors are passed as strings. For field selectors, split by commas to get the number of key-value pairs. + # Label selectors can be formatted as "app in (app1, app2, app3)", so split by commas only outside parentheses to get the number of key-value pairs. 
+ kubernetesLabelSelectorsLength = kubernetesLabelSelectors.split(/,\s*(?=[^()]*(?:\(|$))/).length + kubernetesFieldSelectorsLength = kubernetesFieldSelectors.split(",").length + end + + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? + file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_RS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export TELEMETRY_RS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") + file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") + # Remove below block after phased rollout + if (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil? 
&& (@sidecarScrapingEnabled.casecmp("false") == 0))) + file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + file.write("export TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") + file.write("export TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH=\"#{kubernetesLabelSelectorsLength}\"\n") + file.write("export TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH=\"#{kubernetesFieldSelectorsLength}\"\n") + end + + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for replicaset" + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for replicaset, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults") + setRsPromDefaults + puts "****************End Prometheus Config Processing********************" + end + elsif @controller.casecmp(@daemonset) == 0 && + ((!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) || + (!@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0) && @sidecarScrapingEnabled.strip.casecmp("true") == 0) && + !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? 
+ #Get prometheus custom config settings for monitor kubernetes pods + begin + interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] + monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] + monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] + kubernetesLabelSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_label_selector] + kubernetesFieldSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_field_selector] + + # Check for the right datatypes to enforce right setting values + if checkForType(interval, String) && + checkForType(kubernetesLabelSelectors, String) && + checkForType(kubernetesFieldSelectors, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby + puts "config::Successfully passed typecheck for config settings for custom prometheus scraping" + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultCustomPrometheusInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultCustomPrometheusFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultCustomPrometheusFieldDrop : fieldDrop + monitorKubernetesPods = (monitorKubernetesPods.nil?) ? @defaultCustomPrometheusMonitorPods : monitorKubernetesPods + kubernetesLabelSelectors = (kubernetesLabelSelectors.nil?) ? @defaultCustomPrometheusLabelSelectors : kubernetesLabelSelectors + kubernetesFieldSelectors = (kubernetesFieldSelectors.nil?) ?
@defaultCustomPrometheusFieldSelectors : kubernetesFieldSelectors + + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + file_name = "/etc/telegraf/telegraf.conf" + else + file_name = "/opt/telegraf-test-prom-side-car.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf", file_name) + end + puts "config::Starting to substitute the placeholders in telegraf conf copy file for linux or conf file for windows for custom prometheus scraping" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", interval) + fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", fieldPassSetting) + fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", fieldDropSetting) + + # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accommodate for other settings to be able - + # - to use defaults in case of nil settings + monitorKubernetesPodsNSConfig = [] + if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil?
&& checkForTypeArray(monitorKubernetesPodsNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (monitorKubernetesPodsNamespaces.length > 0) + new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length + monitorKubernetesPodsNSConfig = monitorKubernetesPodsNamespaces + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + + # Label and field selectors are passed as strings. For field selectors, split by commas to get the number of key-value pairs. + # Label selectors can be formatted as "app in (app1, app2, app3)", so split by commas only outside parentheses to get the number of key-value pairs. + kubernetesLabelSelectorsLength = kubernetesLabelSelectors.split(/,\s*(?=[^()]*(?:\(|$))/).length + kubernetesFieldSelectorsLength = kubernetesFieldSelectors.split(",").length + + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for custom prometheus scraping" + #Set environment variables for telemetry in the sidecar container + if (!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? 
+ #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_LABEL_SELECTOR_LENGTH=\"#{kubernetesLabelSelectorsLength}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_FIELD_SELECTOR_LENGTH=\"#{kubernetesFieldSelectorsLength}\"\n") + + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for prometheus sidecar" + end + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for prometheus side car, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for prometheus side car: #{errorStr}, using defaults") + puts "****************End Prometheus Config Processing********************" + end + elsif @controller.casecmp(@daemonset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:node].nil? + #Get prometheus daemonset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:node][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:node][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:node][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:node][:urls] + + # Check for the right datatypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(urls, String) + puts "config::Successfully passed typecheck for config settings for daemonset" + + #if setting is nil assign default values + interval = (interval.nil?) ?
@defaultDsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultDsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultDsFieldDrop : fieldDrop + urls = (urls.nil?) ? @defaultDsPromUrls : urls + + file_name = "/opt/telegraf-test.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for daemonset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" + + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? 
+ file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_DS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export TELEMETRY_DS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export TELEMETRY_DS_PROM_URLS_LENGTH=#{urls.length}\n") + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for daemonset" + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for daemonset, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults, please check correctness of configmap") + puts "****************End Prometheus Config Processing********************" + end + end # end of controller type check + end + else + ConfigParseErrorLogger.logError("Controller undefined while processing prometheus config, using defaults") + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Prometheus Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? 
+ populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@promConfigMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported version") + else + puts "config::No configmap mounted for prometheus custom config, using defaults" + end +end +puts "****************End Prometheus Config Processing********************" diff --git a/build/linux/installer/conf/prometheus-side-car.conf b/build/linux/installer/conf/prometheus-side-car.conf new file mode 100644 index 000000000..fd40910d9 --- /dev/null +++ b/build/linux/installer/conf/prometheus-side-car.conf @@ -0,0 +1,4 @@ + + + + diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf new file mode 100644 index 000000000..720f54820 --- /dev/null +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -0,0 +1,28 @@ +[SERVICE] + #Default service flush interval is 15 seconds + Flush 15 + HTTP_Server Off + Daemon Off + storage.path /var/opt/microsoft/docker-cimprov/state/flbstore/ + storage.sync normal + storage.checksum off + storage.backlog.mem_limit 10M + Log_Level info + Parsers_File /etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf + Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log + +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25229 + Chunk_Size 1m + Buffer_Size 1m + Mem_Buf_Limit 20m + +[OUTPUT] + Name oms + EnableTelemetry true + Retry_Limit 10 + TelemetryPushIntervalSeconds 300 + Match oms.container.* \ No newline at end of file diff --git a/build/linux/installer/conf/telegraf-prom-side-car.conf b/build/linux/installer/conf/telegraf-prom-side-car.conf new file mode 100644 index 000000000..b3b4ba1d3 --- /dev/null +++ b/build/linux/installer/conf/telegraf-prom-side-car.conf @@ -0,0 +1,162 @@ +# Telegraf Configuration +# +# Telegraf is 
entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + hostName = "placeholder_hostname" + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 3000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 60000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. 
Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "15s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types. 
+[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25229" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + namedrop = ["agent_telemetry"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +[[processors.converter]] + [processors.converter.fields] + float = ["*"] + +#Prometheus Custom Metrics +[[inputs.prometheus]] + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. 
+ ## - prometheus.io/port: If port is not 9102 use this annotation + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP + + metric_version = 2 + url_tag = "scrapeUrl" + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER + +## OSM Prometheus configuration +$AZMON_TELEGRAF_OSM_PROM_PLUGINS diff --git a/build/linux/installer/conf/telegraf-rs.conf b/build/linux/installer/conf/telegraf-rs.conf index d81196330..ee1cf8819 100644 --- a/build/linux/installer/conf/telegraf-rs.conf +++ b/build/linux/installer/conf/telegraf-rs.conf @@ -540,13 +540,13 @@ #Prometheus Custom Metrics [[inputs.prometheus]] - interval = "$AZMON_RS_PROM_INTERVAL" + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" ## An array of urls to scrape metrics from. - urls = $AZMON_RS_PROM_URLS + urls = $AZMON_TELEGRAF_CUSTOM_PROM_URLS ## An array of Kubernetes services to scrape metrics from. 
- kubernetes_services = $AZMON_RS_PROM_K8S_SERVICES + kubernetes_services = $AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES ## Scrape Kubernetes pods for the following prometheus annotations: ## - prometheus.io/scrape: Enable scraping for this pod @@ -554,10 +554,15 @@ ## set this to `https` & most likely set the tls config. ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. ## - prometheus.io/port: If port is not 9102 use this annotation - $AZMON_RS_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS - fieldpass = $AZMON_RS_PROM_FIELDPASS - fielddrop = $AZMON_RS_PROM_FIELDDROP + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP metric_version = 2 url_tag = "scrapeUrl" @@ -581,7 +586,11 @@ insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] -$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER + +## OSM Prometheus configuration +$AZMON_TELEGRAF_OSM_PROM_PLUGINS + # [[inputs.exec]] # ## Commands array # interval = "15m" diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index c680f0eea..df8fbc3da 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -110,24 +110,28 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlrb/string_utils.rb; source/toml-parser/tomlrb/string_utils.rb; 644; root; root /opt/tomlrb/version.rb; source/toml-parser/tomlrb/version.rb; 644; root; root -/opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root 
-/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; build/linux/installer/conf/td-agent-bit.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; build/linux/installer/conf/td-agent-bit-rs.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/linux/installer/conf/azm-containers-parser.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/out_oms.conf; build/linux/installer/conf/out_oms.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/telegraf.conf; build/linux/installer/conf/telegraf.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; build/linux/installer/conf/telegraf-rs.conf; 644; root; root -/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root -/opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root -/opt/tomlparser-prom-customconfig.rb; build/linux/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root -/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root -/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root +/opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root +/etc/opt/microsoft/docker-cimprov/prometheus-side-car.conf; build/linux/installer/conf/prometheus-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; build/linux/installer/conf/td-agent-bit.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf; build/linux/installer/conf/td-agent-bit-prom-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; build/linux/installer/conf/td-agent-bit-rs.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; 
build/linux/installer/conf/azm-containers-parser.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/out_oms.conf; build/linux/installer/conf/out_oms.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf.conf; build/linux/installer/conf/telegraf.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf; build/linux/installer/conf/telegraf-prom-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; build/linux/installer/conf/telegraf-rs.conf; 644; root; root +/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root +/opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root +/opt/tomlparser-prom-customconfig.rb; build/common/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root +/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root /opt/tomlparser-agent-config.rb; build/linux/installer/scripts/tomlparser-agent-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; build/common/installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root /opt/ConfigParseErrorLogger.rb; build/common/installer/scripts/ConfigParseErrorLogger.rb; 755; root; root /opt/tomlparser-npm-config.rb; build/linux/installer/scripts/tomlparser-npm-config.rb; 755; root; root +/opt/tomlparser-osm-config.rb; build/linux/installer/scripts/tomlparser-osm-config.rb; 755; root; root /opt/microsoft/omsagent/plugin/filter_cadvisor_health_container.rb; source/plugins/ruby/filter_cadvisor_health_container.rb; 644; root; root diff --git a/build/linux/installer/scripts/livenessprobe.sh 
b/build/linux/installer/scripts/livenessprobe.sh index e3f9fb475..a82fa28eb 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -26,15 +26,22 @@ then exit 1 fi -if [ ! -s "inotifyoutput.txt" ] +if [ -s "inotifyoutput.txt" ] then - # inotifyoutput file is empty and the grep commands for omsagent and td-agent-bit succeeded - exit 0 -else - if [ -s "inotifyoutput.txt" ] - then - # inotifyoutput file has data(config map was applied) - echo "inotifyoutput.txt has been updated - config changed" > /dev/termination-log - exit 1 - fi + # inotifyoutput file has data(config map was applied) + echo "inotifyoutput.txt has been updated - config changed" > /dev/termination-log + exit 1 fi + +# Perform the following check only for prometheus sidecar that does OSM scraping or for replicaset when sidecar scraping is disabled +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( ( ! -z "${SIDECAR_SCRAPING_ENABLED}" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ) ]]; then + if [ -s "inotifyoutput-osm.txt" ] + then + # inotifyoutput-osm file has data(config map was applied) + echo "inotifyoutput-osm.txt has been updated - config changed" > /dev/termination-log + exit 1 + fi +fi + +exit 0 diff --git a/build/linux/installer/scripts/tomlparser-osm-config.rb b/build/linux/installer/scripts/tomlparser-osm-config.rb new file mode 100644 index 000000000..096064db8 --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-osm-config.rb @@ -0,0 +1,168 @@ +#!/usr/local/bin/ruby + +require_relative "tomlrb" +require "fileutils" +require_relative "ConfigParseErrorLogger" + +@controllerType = ENV["CONTROLLER_TYPE"] +@containerType = ENV["CONTAINER_TYPE"] +@sidecarScrapingEnabled = ENV["SIDECAR_SCRAPING_ENABLED"] + +@replicaset = "replicaset" +@prometheusSidecar = "prometheussidecar" + +if !@controllerType.nil? && !@controllerType.empty? 
&& @controllerType.strip.casecmp(@replicaset) == 0 && + (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil? && !@sidecarScrapingEnabled.empty? && @sidecarScrapingEnabled.strip.casecmp("false") == 0)) + @tgfConfigFile = "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + @tgfTestConfigFile = "/opt/telegraf-test-rs.conf" +elsif !@containerType.nil? && !@containerType.empty? && @containerType.strip.casecmp(@prometheusSidecar) == 0 + @tgfConfigFile = "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + @tgfTestConfigFile = "/opt/telegraf-test-prom-side-car.conf" +end + +@configMapMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" +@configSchemaVersion = "" +# @tgfConfigFileSidecar = "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" +# @tgfTestConfigFile = "/opt/telegraf-test-prom-side-car.conf" +@osmMetricNamespaces = [] + +#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering +@metricVersion = 2 +@monitorKubernetesPodsVersion = 2 +#@fieldPassSetting = "[\"envoy_cluster_upstream_rq_xx\", \"envoy_cluster_upstream_rq\"]" +@fieldPassSetting = "[\"envoy_cluster_upstream_cx_total\", \"envoy_cluster_upstream_cx_connect_fail\", \"envoy_cluster_upstream_rq\", \"envoy_cluster_upstream_rq_xx\", \"envoy_cluster_upstream_rq_total\", \"envoy_cluster_upstream_rq_time_bucket\", \"envoy_cluster_upstream_cx_rx_bytes_total\", \"envoy_cluster_upstream_cx_tx_bytes_total\", \"envoy_cluster_upstream_cx_active\"]" +@scrapeInterval = "1m" +@urlTag = "scrapeUrl" +@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" +@responseTimeout = "15s" +@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" +@insecureSkipVerify = true + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-osmconfig for osm 
 metrics found, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map for osm metrics" + return parsedConfig + else + puts "config::configmap container-azm-ms-osmconfig for osm metrics not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for osm metrics: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +def checkForTypeArray(arrayValue, arrayType) + if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) + return true + else + return false + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + begin + if !parsedConfig.nil? && + !parsedConfig[:osm_metric_collection_configuration].nil? && + !parsedConfig[:osm_metric_collection_configuration][:settings].nil? + osmPromMetricNamespaces = parsedConfig[:osm_metric_collection_configuration][:settings][:monitor_namespaces] + puts "config::osm::got:osm_metric_collection_configuration.settings.monitor_namespaces='#{osmPromMetricNamespaces}'" + + # Check to see if osm_metric_collection_configuration.settings has a valid setting for monitor_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accommodate for other settings to be able - + # - to use defaults in case of nil settings + if !osmPromMetricNamespaces.nil? 
&& checkForTypeArray(osmPromMetricNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (osmPromMetricNamespaces.length > 0) + @osmMetricNamespaces = osmPromMetricNamespaces + end + end + end + rescue => errorStr + puts "config::osm::error:Exception while reading config settings for osm configuration settings - #{errorStr}, using defaults" + @osmMetricNamespaces = [] + end +end + +def replaceOsmTelegrafConfigPlaceHolders + begin + #replace place holders in configuration file + tgfConfig = File.read(@tgfTestConfigFile) #read returns only after closing the file + + if @osmMetricNamespaces.length > 0 + osmPluginConfigsWithNamespaces = "" + @osmMetricNamespaces.each do |namespace| + if !namespace.nil? + #Stripping namespaces to remove leading and trailing whitespaces + namespace.strip! + if namespace.length > 0 + osmPluginConfigsWithNamespaces += "\n[[inputs.prometheus]] + name_prefix=\"container.azm.ms.osm/\" + interval = \"#{@scrapeInterval}\" + monitor_kubernetes_pods = true + pod_scrape_scope = \"#{(@controllerType.casecmp(@replicaset) == 0) ? 
"cluster" : "node"}\" + monitor_kubernetes_pods_namespace = \"#{namespace}\" + fieldpass = #{@fieldPassSetting} + metric_version = #{@metricVersion} + url_tag = \"#{@urlTag}\" + bearer_token = \"#{@bearerToken}\" + response_timeout = \"#{@responseTimeout}\" + tls_ca = \"#{@tlsCa}\" + insecure_skip_verify = #{@insecureSkipVerify}\n" + end + end + end + tgfConfig = tgfConfig.gsub("$AZMON_TELEGRAF_OSM_PROM_PLUGINS", osmPluginConfigsWithNamespaces) + else + puts "Using defaults for OSM configuration since there was an error in OSM config map or no namespaces were set" + tgfConfig = tgfConfig.gsub("$AZMON_TELEGRAF_OSM_PROM_PLUGINS", "") + end + File.open(@tgfTestConfigFile, "w") { |file| file.puts tgfConfig } # 'file' will be closed here after it goes out of scope + puts "config::osm::Successfully substituted the OSM placeholders in #{@tgfTestConfigFile} file in sidecar container" + rescue => errorStr + # TODO: test this scenario out + puts "config::osm::error:Exception while replacing telegraf configuration settings for osm - #{errorStr}, using defaults" + end +end + +@osmConfigSchemaVersion = ENV["AZMON_OSM_CFG_SCHEMA_VERSION"] +puts "****************Start OSM Config Processing********************" +if !@osmConfigSchemaVersion.nil? && !@osmConfigSchemaVersion.empty? && @osmConfigSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + # Check to see if the prometheus custom config parser has created a test config file so that we can replace the settings in the test file and run it, If not create + # a test config file by copying contents of the actual telegraf config file. 
+ if (!File.exist?(@tgfTestConfigFile)) + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + puts "test telegraf config file #{@tgfTestConfigFile} does not exist, creating new one" + FileUtils.cp(@tgfConfigFile, @tgfTestConfigFile) + end + + replaceOsmTelegrafConfigPlaceHolders() + + # Write the telemetry to file, so that they can be set as environment variables + telemetryFile = File.open("integration_osm_config_env_var", "w") + + if !telemetryFile.nil? + telemetryFile.write("export TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT=#{@osmMetricNamespaces.length}\n") + # Close file after writing all environment variables + telemetryFile.close + else + puts "config::osm::Exception while opening file for writing OSM telemetry environment variables" + end + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::osm::unsupported/missing config schema version - '#{@osmConfigSchemaVersion}' , using defaults, please use supported schema version") + else + puts "config::No configmap mounted for OSM config, using defaults" + end +end +puts "****************End OSM Config Processing********************" diff --git a/build/linux/installer/scripts/tomlparser-prom-customconfig.rb b/build/linux/installer/scripts/tomlparser-prom-customconfig.rb deleted file mode 100644 index 7aad580ee..000000000 --- a/build/linux/installer/scripts/tomlparser-prom-customconfig.rb +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/local/bin/ruby - -require_relative "tomlrb" -require_relative "ConfigParseErrorLogger" -require "fileutils" - -@promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" -@replicaset = "replicaset" -@daemonset = "daemonset" -@configSchemaVersion = "" -@defaultDsInterval = "1m" -@defaultDsPromUrls = [] -@defaultDsFieldPass = [] -@defaultDsFieldDrop = [] -@defaultRsInterval = "1m" -@defaultRsPromUrls = [] -@defaultRsFieldPass = [] -@defaultRsFieldDrop = [] -@defaultRsK8sServices = [] 
-@defaultRsMonitorPods = false - -#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering -@metricVersion = 2 -@urlTag = "scrapeUrl" -@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" -@responseTimeout = "15s" -@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" -@insecureSkipVerify = true - -# Use parser to parse the configmap toml file to a ruby structure -def parseConfigMap - begin - # Check to see if config map is created - if (File.file?(@promConfigMapMountPath)) - puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values for prometheus config map" - parsedConfig = Tomlrb.load_file(@promConfigMapMountPath, symbolize_keys: true) - puts "config::Successfully parsed mounted prometheus config map" - return parsedConfig - else - puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for prometheus scraping" - return nil - end - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config map for prometheus config: #{errorStr}, using defaults, please check config map for errors") - return nil - end -end - -def checkForTypeArray(arrayValue, arrayType) - if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) - return true - else - return false - end -end - -def checkForType(variable, varType) - if variable.nil? 
|| variable.kind_of?(varType) - return true - else - return false - end -end - -def replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - begin - new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", ("monitor_kubernetes_pods = #{monitorKubernetesPods}")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", "") - rescue => errorStr - puts "Exception while replacing default pod monitor settings: #{errorStr}" - end - return new_contents -end - -def createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting) - begin - new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_RS_PROM_MONITOR_PODS") - pluginConfigsWithNamespaces = "" - monitorKubernetesPodsNamespaces.each do |namespace| - if !namespace.nil? - #Stripping namespaces to remove leading and trailing whitespaces - namespace.strip! 
- if namespace.length > 0 - pluginConfigsWithNamespaces += "\n[[inputs.prometheus]] - interval = \"#{interval}\" - monitor_kubernetes_pods = true - monitor_kubernetes_pods_namespace = \"#{namespace}\" - fieldpass = #{fieldPassSetting} - fielddrop = #{fieldDropSetting} - metric_version = #{@metricVersion} - url_tag = \"#{@urlTag}\" - bearer_token = \"#{@bearerToken}\" - response_timeout = \"#{@responseTimeout}\" - tls_ca = \"#{@tlsCa}\" - insecure_skip_verify = #{@insecureSkipVerify}\n" - end - end - end - new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", pluginConfigsWithNamespaces) - return new_contents - rescue => errorStr - puts "Exception while creating prometheus input plugins to filter namespaces: #{errorStr}, using defaults" - replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - end -end - -# Use the ruby structure created after config parsing to set the right values to be used as environment variables -def populateSettingValuesFromConfigMap(parsedConfig) - # Checking to see if this is the daemonset or replicaset to parse config accordingly - controller = ENV["CONTROLLER_TYPE"] - if !controller.nil? - if !parsedConfig.nil? && !parsedConfig[:prometheus_data_collection_settings].nil? - if controller.casecmp(@replicaset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? 
- #Get prometheus replicaset custom config settings - begin - interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] - fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] - fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] - urls = parsedConfig[:prometheus_data_collection_settings][:cluster][:urls] - kubernetesServices = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_services] - monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] - monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] - - # Check for the right datattypes to enforce right setting values - if checkForType(interval, String) && - checkForTypeArray(fieldPass, String) && - checkForTypeArray(fieldDrop, String) && - checkForTypeArray(kubernetesServices, String) && - checkForTypeArray(urls, String) && - (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby - puts "config::Successfully passed typecheck for config settings for replicaset" - #if setting is nil assign default values - interval = (interval.nil?) ? @defaultRsInterval : interval - fieldPass = (fieldPass.nil?) ? @defaultRsFieldPass : fieldPass - fieldDrop = (fieldDrop.nil?) ? @defaultRsFieldDrop : fieldDrop - kubernetesServices = (kubernetesServices.nil?) ? @defaultRsK8sServices : kubernetesServices - urls = (urls.nil?) ? @defaultRsPromUrls : urls - monitorKubernetesPods = (monitorKubernetesPods.nil?) ? 
@defaultRsMonitorPods : monitorKubernetesPods - - file_name = "/opt/telegraf-test-rs.conf" - # Copy the telegraf config file to a temp file to run telegraf in test mode with this config - FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf", file_name) - - puts "config::Starting to substitute the placeholders in telegraf conf copy file for replicaset" - #Replace the placeholder config values with values from custom config - text = File.read(file_name) - new_contents = text.gsub("$AZMON_RS_PROM_INTERVAL", interval) - fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDPASS", fieldPassSetting) - fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDDROP", fieldDropSetting) - new_contents = new_contents.gsub("$AZMON_RS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_K8S_SERVICES", ((kubernetesServices.length > 0) ? ("[\"" + kubernetesServices.join("\",\"") + "\"]") : "[]")) - - # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces - # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - - # - to use defaults in case of nil settings - if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? 
&& checkForTypeArray(monitorKubernetesPodsNamespaces, String) - new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting) - monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length - else - new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - monitorKubernetesPodsNamespacesLength = 0 - end - - File.open(file_name, "w") { |file| file.puts new_contents } - puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" - #Set environment variables for telemetry - file = File.open("telemetry_prom_config_env_var", "w") - if !file.nil? - file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") - #Setting array lengths as environment variables for telemetry purposes - file.write("export TELEMETRY_RS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") - file.write("export TELEMETRY_RS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") - file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") - file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") - file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") - file.write("export TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") - - # Close file after writing all environment variables - file.close - puts "config::Successfully created telemetry file for replicaset" - end - else - ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for replicaset, using defaults, please use right types for all settings") - end # end of type check condition - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults") - setRsPromDefaults - puts "****************End Prometheus Config 
Processing********************" - end - elsif controller.casecmp(@daemonset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:node].nil? - #Get prometheus daemonset custom config settings - begin - interval = parsedConfig[:prometheus_data_collection_settings][:node][:interval] - fieldPass = parsedConfig[:prometheus_data_collection_settings][:node][:fieldpass] - fieldDrop = parsedConfig[:prometheus_data_collection_settings][:node][:fielddrop] - urls = parsedConfig[:prometheus_data_collection_settings][:node][:urls] - - # Check for the right datattypes to enforce right setting values - if checkForType(interval, String) && - checkForTypeArray(fieldPass, String) && - checkForTypeArray(fieldDrop, String) && - checkForTypeArray(urls, String) - puts "config::Successfully passed typecheck for config settings for daemonset" - - #if setting is nil assign default values - interval = (interval.nil?) ? @defaultDsInterval : interval - fieldPass = (fieldPass.nil?) ? @defaultDsFieldPass : fieldPass - fieldDrop = (fieldDrop.nil?) ? @defaultDsFieldDrop : fieldDrop - urls = (urls.nil?) ? @defaultDsPromUrls : urls - - file_name = "/opt/telegraf-test.conf" - # Copy the telegraf config file to a temp file to run telegraf in test mode with this config - FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf.conf", file_name) - - puts "config::Starting to substitute the placeholders in telegraf conf copy file for daemonset" - #Replace the placeholder config values with values from custom config - text = File.read(file_name) - new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) - new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", ((urls.length > 0) ? 
("[\"" + urls.join("\",\"") + "\"]") : "[]")) - File.open(file_name, "w") { |file| file.puts new_contents } - puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" - - #Set environment variables for telemetry - file = File.open("telemetry_prom_config_env_var", "w") - if !file.nil? - file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") - #Setting array lengths as environment variables for telemetry purposes - file.write("export TELEMETRY_DS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") - file.write("export TELEMETRY_DS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") - file.write("export TELEMETRY_DS_PROM_URLS_LENGTH=#{urls.length}\n") - # Close file after writing all environment variables - file.close - puts "config::Successfully created telemetry file for daemonset" - end - else - ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for daemonset, using defaults, please use right types for all settings") - end # end of type check condition - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults, please check correctness of configmap") - puts "****************End Prometheus Config Processing********************" - end - end # end of controller type check - end - else - ConfigParseErrorLogger.logError("Controller undefined while processing prometheus config, using defaults") - end -end - -@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] -puts "****************Start Prometheus Config Processing********************" -if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = parseConfigMap - if !configMapSettings.nil? 
- populateSettingValuesFromConfigMap(configMapSettings) - end -else - if (File.file?(@promConfigMapMountPath)) - ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported version") - else - puts "config::No configmap mounted for prometheus custom config, using defaults" - end -end -puts "****************End Prometheus Config Processing********************" diff --git a/build/windows/installer/conf/fluent-bit.conf b/build/windows/installer/conf/fluent-bit.conf index 879ee4810..1eebe5fd6 100644 --- a/build/windows/installer/conf/fluent-bit.conf +++ b/build/windows/installer/conf/fluent-bit.conf @@ -12,6 +12,15 @@ Chunk_Size 32 Buffer_Size 64 +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25229 + Chunk_Size 32 + Buffer_Size 64 + Mem_Buf_Limit 5m + [OUTPUT] Name oms EnableTelemetry true diff --git a/build/windows/installer/conf/telegraf.conf b/build/windows/installer/conf/telegraf.conf new file mode 100644 index 000000000..5f4d2364e --- /dev/null +++ b/build/windows/installer/conf/telegraf.conf @@ -0,0 +1,162 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. 
+[global_tags] + hostName = "placeholder_hostname" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "15s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. 
+ ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types. +[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25229" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + namedrop = ["agent_telemetry"] + #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +[[processors.converter]] + [processors.converter.fields] + float = ["*"] + +#Prometheus Custom Metrics +[[inputs.prometheus]] + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP + + metric_version = 2 + url_tag = "scrapeUrl" + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Use bearer token for authorization. 
('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index 82d210f3d..8868b86bb 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -81,6 +81,12 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: PODNAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SIDECAR_SCRAPING_ENABLED + value: "false" volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index 37b8faacc..9b6656e9c 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -72,7 +72,9 @@ spec: value: {{ .Values.Azure.Extension.Name | quote }} {{- end }} - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" + value: "" + - name: SIDECAR_SCRAPING_ENABLED + value: "false" - name: ISTEST value: {{ .Values.omsagent.ISTEST | quote }} securityContext: @@ -109,6 +111,9 @@ spec: - mountPath: /etc/config/settings/adx name: omsagent-adx-secret readOnly: true + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config + readOnly: true livenessProbe: 
exec: command: @@ -157,5 +162,9 @@ spec: - name: omsagent-adx-secret secret: secretName: omsagent-adx-secret - optional: true + optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true {{- end }} diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index aec1bb456..e38d9b4ab 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -76,6 +76,17 @@ data: ## ex: monitor_kubernetes_pods_namespaces = ["default1", "default2", "default3"] # monitor_kubernetes_pods_namespaces = ["default1"] + ## Label selector to target pods which have the specified label + ## This will take effect when monitor_kubernetes_pods is set to true + ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + # kubernetes_label_selector = "env=dev,app=nginx" + + ## Field selector to target pods which have the specified field + ## This will take effect when monitor_kubernetes_pods is set to true + ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/ + ## eg. To scrape pods on a specific node + # kubernetes_field_selector = "spec.nodeName=$HOSTNAME" + [prometheus_data_collection_settings.node] # Node level scrape endpoint(s). These metrics will be scraped from agent's DaemonSet running in every node in the cluster # Any errors related to prometheus scraping can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to. diff --git a/kubernetes/container-azm-ms-osmconfig.yaml b/kubernetes/container-azm-ms-osmconfig.yaml new file mode 100644 index 000000000..05b7ac3ed --- /dev/null +++ b/kubernetes/container-azm-ms-osmconfig.yaml @@ -0,0 +1,17 @@ +kind: ConfigMap +apiVersion: v1 +data: + schema-version: + #string.used by agent to parse OSM config. supported versions are {v1}. 
Configs with other schema versions will be rejected by the agent. + v1 + config-version: + #string.used by OSM addon team to keep track of this config file's version in their source control/repository (max allowed 10 chars, other chars will be truncated) + ver1 + osm-metric-collection-configuration: |- + # OSM metric collection settings + [osm_metric_collection_configuration.settings] + # Namespaces to monitor + # monitor_namespaces = ["namespace1", "namespace2"] +metadata: + name: container-azm-ms-osmconfig + namespace: kube-system diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index bee718a31..bcdc31330 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,7 @@ ENV KUBE_CLIENT_BACKOFF_BASE 1 ENV KUBE_CLIENT_BACKOFF_DURATION 0 ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* -COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs mdsd.xml envmdsd $tmpdir/ +COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd $tmpdir/ WORKDIR ${tmpdir} # copy docker provider shell bundle to use the agent image diff --git a/kubernetes/linux/defaultpromenvvariables-rs b/kubernetes/linux/defaultpromenvvariables-rs index 1346e62b9..920f4e90e 100644 --- a/kubernetes/linux/defaultpromenvvariables-rs +++ b/kubernetes/linux/defaultpromenvvariables-rs @@ -1,7 +1,12 @@ -export AZMON_RS_PROM_INTERVAL="1m" -export AZMON_RS_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" -export AZMON_RS_PROM_FIELDPASS="[]" -export AZMON_RS_PROM_FIELDDROP="[]" -export AZMON_RS_PROM_URLS="[]" -export AZMON_RS_PROM_K8S_SERVICES="[]" -export AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL="1m" +export 
AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" +export AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE="pod_scrape_scope = 'cluster'" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_URLS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export AZMON_TELEGRAF_OSM_PROM_PLUGINS="" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR="kubernetes_label_selector = ''" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR="kubernetes_field_selector = ''" + diff --git a/kubernetes/linux/defaultpromenvvariables-sidecar b/kubernetes/linux/defaultpromenvvariables-sidecar new file mode 100644 index 000000000..3301488d8 --- /dev/null +++ b/kubernetes/linux/defaultpromenvvariables-sidecar @@ -0,0 +1,9 @@ +export AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL="1m" +export AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" +export AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE="pod_scrape_scope = 'node'" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export AZMON_TELEGRAF_OSM_PROM_PLUGINS="" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR="kubernetes_label_selector = ''" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR="kubernetes_field_selector = ''" diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c4067f25e..71e46875b 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -2,7 +2,17 @@ if [ -e "/etc/config/kube.conf" ]; then cat /etc/config/kube.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf +elif [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "setting omsagent conf file for prometheus sidecar" + cat /etc/opt/microsoft/docker-cimprov/prometheus-side-car.conf > 
/etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf + # omsadmin.sh replaces %MONITOR_AGENT_PORT% and %SYSLOG_PORT% in the monitor.conf and syslog.conf with default ports 25324 and 25224. + # Since we are running 2 omsagents in the same pod, we need to use a different port for the sidecar, + # else we will see the Address already in use - bind(2) for 0.0.0.0:253(2)24 error. + # Look into omsadmin.sh scripts's configure_monitor_agent()/configure_syslog() and find_available_port() methods for more info. + sed -i -e 's/port %MONITOR_AGENT_PORT%/port 25326/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/monitor.conf + sed -i -e 's/port %SYSLOG_PORT%/port 25226/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf else + echo "setting omsagent conf file for daemonset" sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf fi sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf @@ -28,6 +38,12 @@ sudo setfacl -m user:omsagent:rwx /var/opt/microsoft/docker-cimprov/log #Run inotify as a daemon to track changes to the mounted configmap. inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s' +#Run inotify as a daemon to track changes to the mounted configmap for OSM settings. +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + inotifywait /etc/config/osm-settings --daemon --recursive --outfile "/opt/inotifyoutput-osm.txt" --event create,delete --format '%e : %T' --timefmt '+%s' +fi + #resourceid override for loganalytics data. 
if [ -z $AKS_RESOURCE_ID ]; then echo "not setting customResourceId" @@ -68,6 +84,24 @@ if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/ echo "AZMON_AGENT_CFG_FILE_VERSION:$AZMON_AGENT_CFG_FILE_VERSION" fi +#set OSM config schema version +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then + #trim + osm_config_schema_version="$(cat /etc/config/osm-settings/schema-version | xargs)" + #remove all spaces + osm_config_schema_version="${osm_config_schema_version//[[:space:]]/}" + #take first 10 characters + osm_config_schema_version="$(echo $osm_config_schema_version| cut -c1-10)" + + export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version + echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >> ~/.bashrc + source ~/.bashrc + echo "AZMON_OSM_CFG_SCHEMA_VERSION:$AZMON_OSM_CFG_SCHEMA_VERSION" + fi +fi + export PROXY_ENDPOINT="" # Check for internet connectivity or workspace deletion @@ -193,71 +227,58 @@ echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc source ~/.bashrc +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + #Parse the configmap to set the right environment variables. + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb -#Parse the configmap to set the right environment variables. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb - -cat config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source config_env_var - + cat config_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_env_var +fi #Parse the configmap to set the right environment variables for agent config. 
#Note > tomlparser-agent-config.rb has to be parsed first before td-agent-bit-conf-customizer.rb for fbit agent settings -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb -cat agent_config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source agent_config_env_var + cat agent_config_env_var | while read line; do + #echo $line + echo $line >> ~/.bashrc + done + source agent_config_env_var -#Parse the configmap to set the right environment variables for network policy manager (npm) integration. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb + #Parse the configmap to set the right environment variables for network policy manager (npm) integration. + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb -cat integration_npm_config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source integration_npm_config_env_var + cat integration_npm_config_env_var | while read line; do + #echo $line + echo $line >> ~/.bashrc + done + source integration_npm_config_env_var +fi #Replace the placeholders in td-agent-bit.conf file for fluentbit with custom/default values in daemonset -if [ ! -e "/etc/config/kube.conf" ]; then +if [ ! -e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /opt/microsoft/omsagent/ruby/bin/ruby td-agent-bit-conf-customizer.rb fi #Parse the prometheus configmap to create a file with new custom settings. /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-prom-customconfig.rb -#If config parsing was successful, a copy of the conf file with replaced custom settings file is created -if [ ! -e "/etc/config/kube.conf" ]; then - if [ -e "/opt/telegraf-test.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test.conf -test - if [ $? 
-eq 0 ]; then - mv "/opt/telegraf-test.conf" "/etc/opt/microsoft/docker-cimprov/telegraf.conf" - fi - echo "****************End Telegraf Run in Test Mode**************************" - fi -else - if [ -e "/opt/telegraf-test-rs.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test-rs.conf -test - if [ $? -eq 0 ]; then - mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" - fi - echo "****************End Telegraf Run in Test Mode**************************" - fi -fi - #Setting default environment variables to be used in any case of failure in the above steps if [ ! -e "/etc/config/kube.conf" ]; then - cat defaultpromenvvariables | while read line; do - echo $line >> ~/.bashrc - done - source defaultpromenvvariables + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + cat defaultpromenvvariables-sidecar | while read line; do + echo $line >> ~/.bashrc + done + source defaultpromenvvariables-sidecar + else + cat defaultpromenvvariables | while read line; do + echo $line >> ~/.bashrc + done + source defaultpromenvvariables + fi else cat defaultpromenvvariables-rs | while read line; do echo $line >> ~/.bashrc @@ -273,21 +294,37 @@ if [ -e "telemetry_prom_config_env_var" ]; then source telemetry_prom_config_env_var fi + #Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting. 
-/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-mdm-metrics-config.rb +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-mdm-metrics-config.rb -cat config_mdm_metrics_env_var | while read line; do - echo $line >> ~/.bashrc -done -source config_mdm_metrics_env_var + cat config_mdm_metrics_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_mdm_metrics_env_var -#Parse the configmap to set the right environment variables for metric collection settings -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb + #Parse the configmap to set the right environment variables for metric collection settings + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb -cat config_metric_collection_env_var | while read line; do - echo $line >> ~/.bashrc -done -source config_metric_collection_env_var + cat config_metric_collection_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_metric_collection_env_var +fi + +# OSM scraping to be done in replicaset if sidecar car scraping is disabled and always do the scraping from the sidecar (It will always be either one of the two) +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-osm-config.rb + + if [ -e "integration_osm_config_env_var" ]; then + cat integration_osm_config_env_var | while read line; do + echo $line >> ~/.bashrc + done + source integration_osm_config_env_var + fi +fi #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" @@ -511,7 +548,7 @@ fi #start oneagent -if [ ! -e "/etc/config/kube.conf" ]; then +if [ ! 
-e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then if [ ! -z $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE ]; then echo "container logs configmap route is $AZMON_CONTAINER_LOGS_ROUTE" echo "container logs effective route is $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" @@ -552,18 +589,56 @@ if [ ! -e "/etc/config/kube.conf" ]; then fi echo "************end oneagent log routing checks************" +#If config parsing was successful, a copy of the conf file with replaced custom settings file is created +if [ ! -e "/etc/config/kube.conf" ]; then + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ] && [ -e "/opt/telegraf-test-prom-side-car.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-prom-side-car.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-prom-side-car.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + else + if [ -e "/opt/telegraf-test.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test.conf" "/etc/opt/microsoft/docker-cimprov/telegraf.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + fi + fi +else + if [ -e "/opt/telegraf-test-rs.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-rs.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + fi +fi + #telegraf & fluentbit requirements if [ ! 
-e "/etc/config/kube.conf" ]; then - if [ "$CONTAINER_RUNTIME" == "docker" ]; then - /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & - telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "starting fluent-bit and setting telegraf conf file for prometheus sidecar" + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" else - echo "since container run time is $CONTAINER_RUNTIME update the container log fluentbit Parser to cri from docker" - sed -i 's/Parser.docker*/Parser cri/' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf - /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & - telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + echo "starting fluent-bit and setting telegraf conf file for daemonset" + if [ "$CONTAINER_RUNTIME" == "docker" ]; then + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + else + echo "since container run time is $CONTAINER_RUNTIME update the container log fluentbit Parser to cri from docker" + sed -i 's/Parser.docker*/Parser cri/' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + fi fi else + echo "starting fluent-bit and setting telegraf conf file for replicaset" /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf -e 
/opt/td-agent-bit/bin/out_oms.so & telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" fi diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index fe6c0565a..218e3c717 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -60,7 +60,13 @@ sudo apt-get install libcap2-bin -y #service telegraf stop -wget https://github.com/microsoft/Docker-Provider/releases/download/5.0.0.0/telegraf +#wget https://github.com/microsoft/Docker-Provider/releases/download/5.0.0.0/telegraf + +#1.18 pre-release +wget https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_linux_amd64.tar.gz +tar -zxvf telegraf-1.18.0_linux_amd64.tar.gz + +mv /opt/telegraf-1.18.0/usr/bin/telegraf /opt/telegraf chmod 777 /opt/telegraf diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index ebf0257af..c25b9bfd4 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -443,6 +443,59 @@ spec: - /opt/livenessprobe.sh initialDelaySeconds: 60 periodSeconds: 60 +#Only in sidecar scraping mode + - name: omsagent-prometheus + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 500m + memory: 400Mi + requests: + cpu: 75m + memory: 225Mi + env: + # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID_VALUE" + - name: AKS_REGION + value: "VALUE_AKS_RESOURCE_REGION_VALUE" + #Uncomment below two lines for ACS clusters and set the cluster names manually. 
Also comment out the above two lines for ACS clusters + #- name: ACS_RESOURCE_NAME + # value: "my_acs_cluster_name" + - name: CONTAINER_TYPE + value: "PrometheusSidecar" + - name: CONTROLLER_TYPE + value: "DaemonSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + # Update this with the user assigned msi client id for omsagent + - name: USER_ASSIGNED_IDENTITY_CLIENT_ID + value: "" + securityContext: + privileged: true + volumeMounts: + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config + readOnly: true + livenessProbe: + exec: + command: + - /bin/bash + - -c + - /opt/livenessprobe.sh + initialDelaySeconds: 60 + periodSeconds: 60 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -502,6 +555,10 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true --- apiVersion: apps/v1 kind: Deployment @@ -559,6 +616,9 @@ spec: # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID value: "" + # Add the below environment variable to true only in sidecar enabled regions, else set it to false + - name: SIDECAR_SCRAPING_ENABLED + value: "true" securityContext: privileged: true ports: @@ -586,6 +646,8 @@ spec: readOnly: true - mountPath: /etc/config/settings/adx name: omsagent-adx-secret + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config readOnly: true livenessProbe: exec: @@ -658,6 +720,10 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true --- apiVersion: apps/v1 kind: DaemonSet @@ -711,10 +777,16 @@ spec: valueFrom: 
fieldRef: fieldPath: spec.nodeName + - name: PODNAME + valueFrom: + fieldRef: + fieldPath: metadata.name - name: NODE_IP valueFrom: fieldRef: fieldPath: status.hostIP + - name: SIDECAR_SCRAPING_ENABLED + value: "true" volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index d4f118449..c0bebcc93 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -47,6 +47,7 @@ RUN ./setup.ps1 COPY main.ps1 /opt/omsagentwindows/scripts/powershell COPY ./omsagentwindows/installer/scripts/filesystemwatcher.ps1 /opt/omsagentwindows/scripts/powershell COPY ./omsagentwindows/installer/scripts/livenessprobe.cmd /opt/omsagentwindows/scripts/cmd/ +COPY setdefaulttelegrafenvvariables.ps1 /opt/omsagentwindows/scripts/powershell # copy ruby scripts to /opt folder COPY ./omsagentwindows/installer/scripts/*.rb /opt/omsagentwindows/scripts/ruby/ @@ -62,6 +63,9 @@ COPY ./omsagentwindows/installer/conf/fluent-docker-parser.conf /etc/fluent/ COPY ./omsagentwindows/installer/conf/fluent-bit.conf /etc/fluent-bit COPY ./omsagentwindows/installer/conf/out_oms.conf /etc/omsagentwindows +# copy telegraf conf file +COPY ./omsagentwindows/installer/conf/telegraf.conf /etc/telegraf/ + # copy keepcert alive ruby scripts COPY ./omsagentwindows/installer/scripts/rubyKeepCertificateAlive/*.rb /etc/fluent/plugin/ diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 722392157..95cba2579 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -273,9 +273,9 @@ function Get-ContainerRuntime { return $containerRuntime } -function Start-Fluent { +function Start-Fluent-Telegraf { - # Run fluent-bit service first so that we do not miss any logs being forwarded by the fluentd service. + # Run fluent-bit service first so that we do not miss any logs being forwarded by the fluentd service and telegraf service. 
# Run fluent-bit as a background job. Switch this to a windows service once fluent-bit supports natively running as a windows service Start-Job -ScriptBlock { Start-Process -NoNewWindow -FilePath "C:\opt\fluent-bit\bin\fluent-bit.exe" -ArgumentList @("-c", "C:\etc\fluent-bit\fluent-bit.conf", "-e", "C:\opt\omsagentwindows\out_oms.so") } @@ -289,35 +289,99 @@ function Start-Fluent { (Get-Content -Path C:/etc/fluent/fluent.conf -Raw) -replace 'fluent-docker-parser.conf','fluent-cri-parser.conf' | Set-Content C:/etc/fluent/fluent.conf } + # Start telegraf only in sidecar scraping mode + $sidecarScrapingEnabled = [System.Environment]::GetEnvironmentVariable('SIDECAR_SCRAPING_ENABLED') + if (![string]::IsNullOrEmpty($sidecarScrapingEnabled) -and $sidecarScrapingEnabled.ToLower() -eq 'true') + { + Write-Host "Starting telegraf..." + Start-Telegraf + } + fluentd --reg-winsvc i --reg-winsvc-auto-start --winsvc-name fluentdwinaks --reg-winsvc-fluentdopt '-c C:/etc/fluent/fluent.conf -o C:/etc/fluent/fluent.log' Notepad.exe | Out-Null } -function Generate-Certificates { - Write-Host "Generating Certificates" - C:\\opt\\omsagentwindows\\certgenerator\\certificategenerator.exe -} +function Start-Telegraf { + # Set default telegraf environment variables for prometheus scraping + Write-Host "**********Setting default environment variables for telegraf prometheus plugin..." + .\setdefaulttelegrafenvvariables.ps1 + + # run prometheus custom config parser + Write-Host "**********Running config parser for custom prometheus scraping**********" + ruby /opt/omsagentwindows/scripts/ruby/tomlparser-prom-customconfig.rb + Write-Host "**********End running config parser for custom prometheus scraping**********" + + + # Set required environment variable for telegraf prometheus plugin to run properly + Write-Host "Setting required environment variables for telegraf prometheus input plugin to run properly..." 
+ $kubernetesServiceHost = [System.Environment]::GetEnvironmentVariable("KUBERNETES_SERVICE_HOST", "process") + if (![string]::IsNullOrEmpty($kubernetesServiceHost)) { + [System.Environment]::SetEnvironmentVariable("KUBERNETES_SERVICE_HOST", $kubernetesServiceHost, "machine") + Write-Host "Successfully set environment variable KUBERNETES_SERVICE_HOST - $($kubernetesServiceHost) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable KUBERNETES_SERVICE_HOST for target 'machine' since it is either null or empty" + } + + $kubernetesServicePort = [System.Environment]::GetEnvironmentVariable("KUBERNETES_SERVICE_PORT", "process") + if (![string]::IsNullOrEmpty($kubernetesServicePort)) { + [System.Environment]::SetEnvironmentVariable("KUBERNETES_SERVICE_PORT", $kubernetesServicePort, "machine") + Write-Host "Successfully set environment variable KUBERNETES_SERVICE_PORT - $($kubernetesServicePort) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable KUBERNETES_SERVICE_PORT for target 'machine' since it is either null or empty" + } + + $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") + if (![string]::IsNullOrEmpty($nodeIp)) { + [System.Environment]::SetEnvironmentVariable("NODE_IP", $nodeIp, "machine") + Write-Host "Successfully set environment variable NODE_IP - $($nodeIp) for target 'machine'..." 
+ }
+ else {
+ Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty"
+ }
-function Bootstrap-CACertificates {
+ Write-Host "Installing telegraf service"
+ C:\opt\telegraf\telegraf.exe --service install --config "C:\etc\telegraf\telegraf.conf"
+
+ # Setting delay auto start for telegraf since there have been known issues with windows server and telegraf -
+ # https://github.com/influxdata/telegraf/issues/4081
+ # https://github.com/influxdata/telegraf/issues/3601
 try {
- # This is required when the root CA certs are different for some clouds.
- $caCerts=Invoke-WebRequest 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing | ConvertFrom-Json
- if (![string]::IsNullOrEmpty($caCerts)) {
- $certificates = $caCerts.Certificates
- for ($index = 0; $index -lt $certificates.Length ; $index++) {
- $name=$certificates[$index].Name
- $certificates[$index].CertBody > $name
- Write-Host "name: $($name)"
- Import-Certificate -FilePath .\$name -CertStoreLocation 'Cert:\LocalMachine\Root' -Verbose
- }
+ $serverName = [System.Environment]::GetEnvironmentVariable("PODNAME", "process")
+ if (![string]::IsNullOrEmpty($serverName)) {
+ sc.exe \\$serverName config telegraf start= delayed-auto
+ Write-Host "Successfully set delayed start for telegraf"
+
+ } else {
+ Write-Host "Failed to get environment variable PODNAME to set delayed telegraf start"
 }
 }
 catch {
- $e = $_.Exception
- Write-Host $e
- Write-Host "exception occured in Bootstrap-CACertificates..."
+ $e = $_.Exception
+ Write-Host $e
+ Write-Host "exception occurred in delayed telegraf start..
continuing without exiting"
 }
+ Write-Host "Running telegraf service in test mode"
+ C:\opt\telegraf\telegraf.exe --config "C:\etc\telegraf\telegraf.conf" --test
+ Write-Host "Starting telegraf service"
+ C:\opt\telegraf\telegraf.exe --service start
+
+ # Trying to start telegraf again if it did not start due to fluent bit not being ready at startup
+ Get-Service telegraf | findstr Running
+ if ($? -eq $false)
+ {
+ Write-Host "trying to start telegraf again in 30 seconds, since fluentbit might not have been ready..."
+ Start-Sleep -s 30
+ C:\opt\telegraf\telegraf.exe --service start
+ Get-Service telegraf
+ }
+}
+
+function Generate-Certificates {
+ Write-Host "Generating Certificates"
+ C:\\opt\\omsagentwindows\\certgenerator\\certificategenerator.exe
 }
 function Test-CertificatePath {
@@ -346,16 +410,9 @@
 Remove-WindowsServiceIfItExists "fluentdwinaks"
 Set-EnvironmentVariables
 Start-FileSystemWatcher
-#Bootstrapping CA certs for non public clouds and AKS clusters
-$aksResourceId = [System.Environment]::GetEnvironmentVariable("AKS_RESOURCE_ID")
-if (![string]::IsNullOrEmpty($aksResourceId) -and $aksResourceId.ToLower().Contains("/microsoft.containerservice/managedclusters/"))
-{
- Bootstrap-CACertificates
-}
-
 Generate-Certificates
 Test-CertificatePath
-Start-Fluent
+Start-Fluent-Telegraf
 # List all powershell processes running.
This should have main.ps1 and filesystemwatcher.ps1 Get-WmiObject Win32_process | Where-Object { $_.Name -match 'powershell' } | Format-Table -Property Name, CommandLine, ProcessId diff --git a/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 b/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 new file mode 100644 index 000000000..269894139 --- /dev/null +++ b/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 @@ -0,0 +1,17 @@ +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", "1m", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", "1m", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "monitor_kubernetes_pods = false", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "monitor_kubernetes_pods = false", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "pod_scrape_scope = 'node'", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "pod_scrape_scope = 'node'", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", "[]", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", "[]", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", "[]", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", "[]", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", " ", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", " ", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "kubernetes_label_selector = ''", "process") 
+[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "kubernetes_label_selector = ''", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "kubernetes_field_selector = ''", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "kubernetes_field_selector = ''", "machine") + diff --git a/kubernetes/windows/setup.ps1 b/kubernetes/windows/setup.ps1 index dd6d52a11..25aad5e16 100644 --- a/kubernetes/windows/setup.ps1 +++ b/kubernetes/windows/setup.ps1 @@ -8,10 +8,12 @@ Write-Host ('Creating folder structure') New-Item -Type Directory -Path /opt/fluent-bit New-Item -Type Directory -Path /opt/scripts/ruby + New-Item -Type Directory -Path /opt/telegraf New-Item -Type Directory -Path /etc/fluent-bit New-Item -Type Directory -Path /etc/fluent New-Item -Type Directory -Path /etc/omsagentwindows + New-Item -Type Directory -Path /etc/telegraf New-Item -Type Directory -Path /etc/config/settings/ New-Item -Type Directory -Path /etc/config/adx/ @@ -32,6 +34,20 @@ Write-Host ('Installing Fluent Bit'); } Write-Host ('Finished Installing Fluentbit') +Write-Host ('Installing Telegraf'); +try { + $telegrafUri='https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_windows_amd64.zip' + Invoke-WebRequest -Uri $telegrafUri -OutFile /installation/telegraf.zip + Expand-Archive -Path /installation/telegraf.zip -Destination /installation/telegraf + Move-Item -Path /installation/telegraf/*/* -Destination /opt/telegraf/ -ErrorAction SilentlyContinue +} +catch { + $ex = $_.Exception + Write-Host "exception while downloading telegraf for windows" + Write-Host $ex + exit 1 +} +Write-Host ('Finished downloading Telegraf') Write-Host ('Installing Visual C++ Redistributable Package') $vcRedistLocation = 'https://aka.ms/vs/16/release/vc_redist.x64.exe' diff --git a/scripts/build/windows/install-build-pre-requisites.ps1 
b/scripts/build/windows/install-build-pre-requisites.ps1 index b5e6e2d18..3bb56ac2a 100755 --- a/scripts/build/windows/install-build-pre-requisites.ps1 +++ b/scripts/build/windows/install-build-pre-requisites.ps1 @@ -21,7 +21,7 @@ function Install-Go { # install go lang Write-Host("installing go ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing go completed") Write-Host "updating PATH variable" @@ -102,7 +102,7 @@ function Install-DotNetCoreSDK() { # install dotNet core sdk Write-Host("installing .net core sdk 3.1 ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing .net core sdk 3.1 completed") } @@ -129,7 +129,7 @@ function Install-Docker() { # install docker Write-Host("installing docker for desktop ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing docker for desktop completed") } diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 0bd983297..d35acad3d 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -1491,4 +1491,4 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Running in replicaset. 
Disabling container enrichment caching & updates \n") } -} +} \ No newline at end of file diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 3d30ac5aa..48f82a9ab 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -10,9 +10,9 @@ import ( "strings" "time" + "github.com/fluent/fluent-bit-go/output" "github.com/microsoft/ApplicationInsights-Go/appinsights" "github.com/microsoft/ApplicationInsights-Go/appinsights/contracts" - "github.com/fluent/fluent-bit-go/output" ) var ( @@ -44,33 +44,45 @@ var ( ContainerLogsMDSDClientCreateErrors float64 //Tracks the number of write/send errors to ADX for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsSendErrorsToADXFromFluent float64 - //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) + //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsADXClientCreateErrors float64 + //Tracks the number of OSM namespaces and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + OSMNamespaceCount int + //Tracks whether monitor kubernetes pods is set to true and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPods string + //Tracks the number of monitor kubernetes pods namespaces and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsNamespaceLength int + //Tracks the number of monitor kubernetes pods label selectors and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsLabelSelectorLength int + //Tracks the number of monitor kubernetes pods field selectors and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsFieldSelectorLength int ) const ( - clusterTypeACS = "ACS" - clusterTypeAKS = "AKS" - envAKSResourceID = "AKS_RESOURCE_ID" - envACSResourceName = "ACS_RESOURCE_NAME" - 
envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" - envAppInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" - metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" - metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" - metricNameLogSize = "ContainerLogsSize" - metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" - metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" - metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" - metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" - metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" - metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" - metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" - metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" + clusterTypeACS = "ACS" + clusterTypeAKS = "AKS" + envAKSResourceID = "AKS_RESOURCE_ID" + envACSResourceName = "ACS_RESOURCE_NAME" + envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" + envAppInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" + metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" + metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" + metricNameLogSize = "ContainerLogsSize" + metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" + metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" + metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" + metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" + metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" + metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" + 
metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" + metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" defaultTelemetryPushIntervalSeconds = 300 - eventNameContainerLogInit = "ContainerLogPluginInitialized" - eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" + eventNameContainerLogInit = "ContainerLogPluginInitialized" + eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" + eventNameCustomPrometheusSidecarHeartbeat = "CustomPrometheusSidecarHeartbeatEvent" + eventNameWindowsFluentBitHeartbeat = "WindowsFluentBitHeartbeatEvent" ) // SendContainerLogPluginMetrics is a go-routine that flushes the data periodically (every 5 mins to App Insights) @@ -100,6 +112,11 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { containerLogsMDSDClientCreateErrors := ContainerLogsMDSDClientCreateErrors containerLogsSendErrorsToADXFromFluent := ContainerLogsSendErrorsToADXFromFluent containerLogsADXClientCreateErrors := ContainerLogsADXClientCreateErrors + osmNamespaceCount := OSMNamespaceCount + promMonitorPods := PromMonitorPods + promMonitorPodsNamespaceLength := PromMonitorPodsNamespaceLength + promMonitorPodsLabelSelectorLength := PromMonitorPodsLabelSelectorLength + promMonitorPodsFieldSelectorLength := PromMonitorPodsFieldSelectorLength TelegrafMetricsSentCount = 0.0 TelegrafMetricsSendErrorCount = 0.0 @@ -118,17 +135,39 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { ContainerLogTelemetryMutex.Unlock() if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { - SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) - flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) - TelemetryClient.Track(flushRateMetric) - logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) - logSizeMetric := 
appinsights.NewMetricTelemetry(metricNameLogSize, logSizeRate) - TelemetryClient.Track(logRateMetric) - Log("Log Size Rate: %f\n", logSizeRate) - TelemetryClient.Track(logSizeMetric) - logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) - logLatencyMetric.Properties["Container"] = logLatencyMsContainer - TelemetryClient.Track(logLatencyMetric) + if strings.Compare(strings.ToLower(os.Getenv("CONTAINER_TYPE")), "prometheussidecar") == 0 { + telemetryDimensions := make(map[string]string) + telemetryDimensions["CustomPromMonitorPods"] = promMonitorPods + if promMonitorPodsNamespaceLength > 0 { + telemetryDimensions["CustomPromMonitorPodsNamespaceLength"] = strconv.Itoa(promMonitorPodsNamespaceLength) + } + if promMonitorPodsLabelSelectorLength > 0 { + telemetryDimensions["CustomPromMonitorPodsLabelSelectorLength"] = strconv.Itoa(promMonitorPodsLabelSelectorLength) + } + if promMonitorPodsFieldSelectorLength > 0 { + telemetryDimensions["CustomPromMonitorPodsFieldSelectorLength"] = strconv.Itoa(promMonitorPodsFieldSelectorLength) + } + if osmNamespaceCount > 0 { + telemetryDimensions["OsmNamespaceCount"] = strconv.Itoa(osmNamespaceCount) + } + + SendEvent(eventNameCustomPrometheusSidecarHeartbeat, telemetryDimensions) + + } else if strings.Compare(strings.ToLower(os.Getenv("OS_TYPE")), "windows") == 0 { + SendEvent(eventNameWindowsFluentBitHeartbeat, make(map[string]string)) + } else { + SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) + flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + TelemetryClient.Track(flushRateMetric) + logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) + logSizeMetric := appinsights.NewMetricTelemetry(metricNameLogSize, logSizeRate) + TelemetryClient.Track(logRateMetric) + Log("Log Size Rate: %f\n", logSizeRate) + TelemetryClient.Track(logSizeMetric) + logLatencyMetric := 
appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) + logLatencyMetric.Properties["Container"] = logLatencyMsContainer + TelemetryClient.Track(logLatencyMetric) + } } TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofTelegrafMetricsSentSuccessfully, telegrafMetricsSentCount)) if telegrafMetricsSendErrorCount > 0.0 { @@ -255,12 +294,60 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } if isProxyConfigured == true { - CommonProperties["IsProxyConfigured"] = "true" + CommonProperties["IsProxyConfigured"] = "true" } else { - CommonProperties["IsProxyConfigured"] = "false" - } + CommonProperties["IsProxyConfigured"] = "false" + } + + // Adding container type to telemetry + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + if strings.Compare(strings.ToLower(os.Getenv("CONTAINER_TYPE")), "prometheussidecar") == 0 { + CommonProperties["ContainerType"] = "prometheussidecar" + } + } TelemetryClient.Context().CommonProperties = CommonProperties + + // Getting the namespace count, monitor kubernetes pods values and namespace count once at start because it wont change unless the configmap is applied and the container is restarted + + OSMNamespaceCount = 0 + osmNsCount := os.Getenv("TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT") + if osmNsCount != "" { + OSMNamespaceCount, err = strconv.Atoi(osmNsCount) + if err != nil { + Log("OSM namespace count string to int conversion error %s", err.Error()) + } + } + + PromMonitorPods = os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS") + + PromMonitorPodsNamespaceLength = 0 + promMonPodsNamespaceLength := os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH") + if promMonPodsNamespaceLength != "" { + PromMonitorPodsNamespaceLength, err = strconv.Atoi(promMonPodsNamespaceLength) + if err != nil { + Log("Custom prometheus monitor kubernetes pods namespace count string to int conversion error %s", err.Error()) + } + } + + 
PromMonitorPodsLabelSelectorLength = 0 + promLabelSelectorLength := os.Getenv("TELEMETRY_CUSTOM_PROM_LABEL_SELECTOR_LENGTH") + if promLabelSelectorLength != "" { + PromMonitorPodsLabelSelectorLength, err = strconv.Atoi(promLabelSelectorLength) + if err != nil { + Log("Custom prometheus label selector count string to int conversion error %s", err.Error()) + } + } + + PromMonitorPodsFieldSelectorLength = 0 + promFieldSelectorLength := os.Getenv("TELEMETRY_CUSTOM_PROM_FIELD_SELECTOR_LENGTH") + if promFieldSelectorLength != "" { + PromMonitorPodsFieldSelectorLength, err = strconv.Atoi(promFieldSelectorLength) + if err != nil { + Log("Custom prometheus field selector count string to int conversion error %s", err.Error()) + } + } + return 0, nil } diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index c803c0fa2..c057f7c2c 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -19,7 +19,10 @@ class Kube_nodeInventory_Input < Input @@rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"] @@rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"] @@rsPromMonitorPodsNamespaceLength = ENV["TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH"] + @@rsPromMonitorPodsLabelSelectorLength = ENV["TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH"] + @@rsPromMonitorPodsFieldSelectorLength = ENV["TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH"] @@collectAllKubeEvents = ENV["AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS"] + @@osmNamespaceCount = ENV["TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT"] def initialize super @@ -296,6 +299,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) properties["rsPromUrl"] = @@rsPromUrlCount properties["rsPromMonPods"] = @@rsPromMonitorPods properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength + properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength + properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength 
+ properties["osmNamespaceCount"] = @@osmNamespaceCount end ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) telemetrySent = true