From 04826d02910e448937d72aace521113bc7bcf438 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 18 Aug 2020 15:57:45 -0700 Subject: [PATCH 01/36] Add in pv metrics from cadvisor --- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 42ecfcaf0..c685ebff9 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -293,6 +293,10 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryTotal", "containerGpumemoryTotalBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) + + metricNamesToCollect = ["availableBytes", "capacityBytes", "usedBytes", "inodes", "inodesUsed", "inodesFree"] + metricNamesToReturn = ["PVAvailableBytes", "PVCapacityBytes", "PVUsedBytes", "PVinodes", "PVinodesUsed", "PVinodesFree"] + metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, metricNamesToCollect, metricNamesToReturn, metricTime)) else @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") end @@ -303,6 +307,73 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) return metricDataItems end + def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, metricNamesToReturn, metricPollTime) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + clusterName = KubernetesApiClient.getClusterName + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + + containerNames = [] + if (!pod["containers"].nil?) + pod["containers"].each do |container| + containerName = container["name"] + containerNames.push(podUid + "/" + containerName) + + if (!pod["volume"].nil?) + pod["volume"].each do |volume| + if (!volume["pvcRef"].nil?) + pvcRef = volume["pvcRef"] + if (!pvcRef["name"].nil?) + + # A PVC exists on this volume + pvcName = pvcRef["name"] + pvName = volume["name"] + time = volume["time"] + + metricCount = 0 + metricNamesToCollect.each do |metricNameToCollect| + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNamesToReturn[metricCount] + metricItem["Value"] = volume[metricNameToCollect] + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName + metricTags["PVName"] = pvName + metricTags["PVCName"] = pvcName + metricTags["Time"] = time + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + + metricCount = metricCount + 1 + end + end + end + end + end + end + end + end + rescue => errorStr + @Log.warn("getContainerGpuMetricsAsInsightsMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + return metricItems + end + return metricItems + end + + def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId From a459794f03130ee37a4f5a2db8b330be0e76b471 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 20 Aug 2020 15:14:41 -0700 Subject: [PATCH 02/36] changed to send only pv usage & add kube-system toggle config --- build/common/installer/scripts/tomlparser.rb | 14 +++++ kubernetes/container-azm-ms-agentconfig.yaml | 6 +++ .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 51 ++++++++----------- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index 7235ee0c3..51c0d7b13 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ b/build/common/installer/scripts/tomlparser.rb @@ -24,6 +24,7 @@ @excludePath = "*.csv2" #some invalid path @enrichContainerLogs = false @collectAllKubeEvents = false +@collectPVKubeSystemMetrics = false @containerLogsRoute = "" # Use parser to parse the configmap toml file to a ruby structure @@ -148,6 +149,16 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for kube event collection - #{errorStr}, using defaults, please check config map for errors") end + #Get PV kube-system enrichment setting + begin + if !parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil? + @collectPVKubeSystemMetrics = parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled] + puts "config::Using config map setting for PV kube-system collection" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for kube event collection - #{errorStr}, using defaults, please check config map for errors") + end + #Get container logs route setting begin if !parsedConfig[:log_collection_settings][:route_container_logs].nil? && !parsedConfig[:log_collection_settings][:route_container_logs][:version].nil? @@ -199,6 +210,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") file.write("export AZMON_CLUSTER_CONTAINER_LOG_ENRICH=#{@enrichContainerLogs}\n") file.write("export AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS=#{@collectAllKubeEvents}\n") + file.write("export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS=#{@collectPVKubeSystemMetrics}\n") file.write("export AZMON_CONTAINER_LOGS_ROUTE=#{@containerLogsRoute}\n") # Close file after writing all environment variables file.close @@ -244,6 +256,8 @@ def get_command_windows(env_variable_name, env_variable_value) file.write(commands) commands = get_command_windows('AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS', @collectAllKubeEvents) file.write(commands) + commands = get_command_windows('export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS', @collectPVKubeSystemMetrics) + file.write(commands) commands = get_command_windows('AZMON_CONTAINER_LOGS_ROUTE', @containerLogsRoute) file.write(commands) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 58e09f041..c4b300e9d 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -42,6 +42,12 @@ data: # When the setting is set to false, only the kube events with !normal event type will be collected enabled = false # When this is enabled (enabled = true), all kube events including normal events will be collected + [log_collection_settings.collect_kube_system_pv_metrics] + # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false + # When the setting is set to false, only the pv metrics outside the kube_system namespace will be collected + enabled = false + # When this is enabled (enabled = true), pv metrics including those in the kube_system namespace will be collected + prometheus-data-collection-settings: |- # Custom Prometheus metrics data collection settings [prometheus_data_collection_settings.cluster] diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 77997feca..4efd9092d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -20,6 +20,7 @@ class CAdvisorMetricsAPIClient @clusterEnvVarCollectionEnabled = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] @clusterStdErrLogCollectionEnabled = ENV["AZMON_COLLECT_STDERR_LOGS"] @clusterStdOutLogCollectionEnabled = ENV["AZMON_COLLECT_STDOUT_LOGS"] + @pvKubeSystemCollectionMetricsEnabled = ENV["AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS"] @clusterLogTailExcludPath = ENV["AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH"] @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] @@ -302,9 +303,7 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) - metricNamesToCollect = ["availableBytes", "capacityBytes", "usedBytes", "inodes", "inodesUsed", "inodesFree"] - metricNamesToReturn = ["PVAvailableBytes", "PVCapacityBytes", "PVUsedBytes", "PVinodes", "PVinodesUsed", "PVinodesFree"] - metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, metricNamesToCollect, metricNamesToReturn, metricTime)) + metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, "usedBytes", "pv_used_bytes", metricTime)) else @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") end @@ -327,7 +326,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, podNamespace = pod["podRef"]["namespace"] containerNames = [] - if (!pod["containers"].nil?) + if ((!podNamespace == "kube-system" || @pvKubeSystemCollectionMetricsEnabled) && !pod["containers"].nil?) pod["containers"].each do |container| containerName = container["name"] containerNames.push(podUid + "/" + containerName) @@ -341,32 +340,26 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, # A PVC exists on this volume pvcName = pvcRef["name"] pvName = volume["name"] - time = volume["time"] - - metricCount = 0 - metricNamesToCollect.each do |metricNameToCollect| - metricItem = {} - metricItem["CollectionTime"] = metricPollTime - metricItem["Computer"] = hostName - metricItem["Name"] = metricNamesToReturn[metricCount] - metricItem["Value"] = volume[metricNameToCollect] - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNamesToReturn[metricCount] + metricItem["Value"] = volume[metricNameToCollect] + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = podNameSpace - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName - metricTags["PVName"] = pvName - metricTags["PVCName"] = pvcName - metricTags["Time"] = time - - metricItem["Tags"] = metricTags + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName + metricTags["pvName"] = pvName + metricTags["pvcName"] = pvcName + metricTags["pv_capacity_bytes"] = volume["capacityBytes"] + + metricItem["Tags"] = metricTags - metricItems.push(metricItem) - - metricCount = metricCount + 1 - end + metricItems.push(metricItem) end end end @@ -375,7 +368,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, end end rescue => errorStr - @Log.warn("getContainerGpuMetricsAsInsightsMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + @Log.warn("getPersistentVolumeClaimMetrics failed: #{errorStr} for metric #{metricNameToCollect}") return metricItems end return metricItems From 0ec8ef9052aec7609fe172fe28cd4bbdf4405b9a Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 21 Aug 2020 09:09:56 -0700 Subject: [PATCH 03/36] variable name fixes --- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 4efd9092d..55d8ad55d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -314,7 +314,7 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) return metricDataItems end - def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, metricNamesToReturn, metricPollTime) + def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId clusterName = KubernetesApiClient.getClusterName @@ -326,7 +326,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, podNamespace = pod["podRef"]["namespace"] containerNames = [] - if ((!podNamespace == "kube-system" || @pvKubeSystemCollectionMetricsEnabled) && !pod["containers"].nil?) + if ((!(podNamespace == "kube-system") || @pvKubeSystemCollectionMetricsEnabled) && !pod["containers"].nil?) pod["containers"].each do |container| containerName = container["name"] containerNames.push(podUid + "/" + containerName) @@ -344,7 +344,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, metricItem = {} metricItem["CollectionTime"] = metricPollTime metricItem["Computer"] = hostName - metricItem["Name"] = metricNamesToReturn[metricCount] + metricItem["Name"] = metricNameToReturn metricItem["Value"] = volume[metricNameToCollect] metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = podNameSpace From fb8a2147820b2fdb1c2008135c69d02b2c740b1e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 09:42:01 -0700 Subject: [PATCH 04/36] Added kube-system config --- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 55d8ad55d..228ff6ad9 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -315,6 +315,14 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) end def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) + @Log.info("Getting PV metrics") + pvKubeSystem = @pvKubeSystemCollectionMetricsEnabled.nil? ? "pv kube-system nil" : "pv kube-system not nil" + @Log.info(pvKubeSystem) + @Log.info(@pvKubeSystemCollectionMetricsEnabled) + + pvKubeSystemCollectionMetrics = ENV["AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS"] + @Log.info(pvKubeSystemCollectionMetrics) + metricItems = [] clusterId = KubernetesApiClient.getClusterId clusterName = KubernetesApiClient.getClusterName @@ -325,11 +333,22 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m podName = pod["podRef"]["name"] podNamespace = pod["podRef"]["namespace"] - containerNames = [] - if ((!(podNamespace == "kube-system") || @pvKubeSystemCollectionMetricsEnabled) && !pod["containers"].nil?) + kubeSystemNamespace = false + if (podNamespace.include? "kube-system") + @Log.info("kube-system namespace encountered") + if (pvKubeSystemCollectionMetrics == "true") + kubeSystemNamespace = false + @Log.info("kube-system namespace encountered - include") + else + kubeSystemNamespace = true + @Log.info("kube-system namespace encountered - exclude") + end + end + + + if (!pod["containers"].nil? && !kubeSystemNamespace) pod["containers"].each do |container| containerName = container["name"] - containerNames.push(podUid + "/" + containerName) if (!pod["volume"].nil?) pod["volume"].each do |volume| @@ -347,7 +366,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m metricItem["Name"] = metricNameToReturn metricItem["Value"] = volume[metricNameToCollect] metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = podNameSpace + metricItem["Namespace"] = "container.azm.ms/pv" metricTags = {} metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId @@ -356,6 +375,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m metricTags["pvName"] = pvName metricTags["pvcName"] = pvcName metricTags["pv_capacity_bytes"] = volume["capacityBytes"] + metricTags["podNamespace"] = podNamespace metricItem["Tags"] = metricTags From 0b2f9dc9eb9c68a4a1f6f2c3822f098ab882b4da Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 13:47:17 -0700 Subject: [PATCH 05/36] mdm filter --- .../scripts/tomlparser-mdm-metrics-config.rb | 12 ++++++ source/plugins/ruby/MdmMetricsGenerator.rb | 8 ++++ source/plugins/ruby/constants.rb | 1 + source/plugins/ruby/filter_cadvisor2mdm.rb | 40 ++++++++++++++++++- 4 files changed, 60 insertions(+), 1 deletion(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 1c01dd8c6..d2990ca0c 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -12,6 +12,7 @@ @percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD +@percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -66,6 +67,15 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default " @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end + #PV + pvUsageThreshold = resourceUtilization[:pv_usage_threshold_percentage] + pvUsageThresholdFloat = pvUsageThreshold.to_f + if pvUsageThresholdFloat.kind_of? Float + @percentagePVUsageThreshold = pvUsageThresholdFloat + else + puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + end puts "config::Using config map settings for MDM metric configuration settings for resource utilization" end rescue => errorStr @@ -73,6 +83,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) @percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end end end @@ -97,6 +108,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_ALERT_CONTAINER_CPU_THRESHOLD=#{@percentageCpuUsageThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") + file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") # Close file after writing all MDM setting environment variables file.close puts "****************End MDM Metrics Config Processing********************" diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 3d75dc6f4..9c0873602 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -356,6 +356,7 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD + metric_threshold_hash["pvUsage"] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if !cpuThreshold.nil? && !cpuThreshold.empty? @@ -375,6 +376,13 @@ def getContainerResourceUtilizationThresholds memoryWorkingSetThresholdFloat = (memoryWorkingSetThreshold.to_f).round(2) metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = memoryWorkingSetThresholdFloat end + + pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"] + @log.info "pvUsagePercentageThreshold: #{pvUsagePercentageThreshold}" + if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? + pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) + metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = pvUsagePercentageThresholdFloat + end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index dd1ba24b3..91dfc6077 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -59,6 +59,7 @@ class Constants DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 + DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 80.0 CONTROLLER_KIND_JOB = "job" CONTAINER_TERMINATION_REASON_COMPLETED = "completed" CONTAINER_STATE_TERMINATED = "terminated" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index fd43ef98b..9fb0af5a2 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -16,7 +16,7 @@ class CAdvisor2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" config_param :custom_metrics_azure_regions, :string - config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES" + config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,pv_used_bytes" @@hostName = (OMS::Common.get_hostname) @@ -51,15 +51,18 @@ def start @containersExceededCpuThreshold = false @containersExceededMemRssThreshold = false @containersExceededMemWorkingSetThreshold = false + @pvExceededUsageThreshold = false # initialize cpu and memory limit if @process_incoming_stream @cpu_capacity = 0.0 @memory_capacity = 0.0 + @pv_capacity = 0.0 ensure_cpu_memory_capacity_set @containerCpuLimitHash = {} @containerMemoryLimitHash = {} @containerResourceDimensionHash = {} + @pvUsageHash = {} @@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds end rescue => e @@ -87,6 +90,8 @@ def setThresholdExceededTelemetry(metricName) @containersExceededMemRssThreshold = true elsif metricName == Constants::MEMORY_WORKING_SET_BYTES @containersExceededMemWorkingSetThreshold = true + elsif metricName == "pv_used_bytes" + @pvExceededUsageThreshold = true end rescue => errorStr @log.info "Error in setThresholdExceededTelemetry: #{errorStr}" @@ -104,10 +109,12 @@ def flushMetricTelemetry properties["CpuThresholdPercentage"] = @@metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] properties["MemoryRssThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_RSS_BYTES] properties["MemoryWorkingSetThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] + properties["PVUsageThresholdPercentage"] = @@metric_threshold_hash["pv_used_bytes"] # Keeping track of any containers that have exceeded threshold in the last flush interval properties["CpuThresholdExceededInLastFlushInterval"] = @containersExceededCpuThreshold properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold properties["MemWSetThresholdExceededInLastFlushInterval"] = @containersExceededMemWorkingSetThreshold + properties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT, properties) @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i @containersExceededCpuThreshold = false @@ -191,6 +198,37 @@ def filter(tag, time, record) else return [] end #end if block for percentage metric > configured threshold % check + elsif tag == Constants::INSIGHTSMETRICS_FLUENT_TAG + @log.info "insights metrics in filter_cadvisor2mdm" + record["DataItems"].each do |dataItem| + if dataItem["Name"] == "pv_used_bytes" + @log.info "pv_used_bytes is a data item" + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"]["pv_capacity_bytes"] + if capacity != 0 + percentage_metric_value = (usage) * 100 / capacity + @log.info "capacity is not 0" + end + @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + resourceDimensions = {} + resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] + resourceDimensions[1] = "podName" + resourceDimensions[2] = "controllerName" + resourceDimensions[3] = dataItem["Tags"]["podNamespace"] + @log.info "resourceDimensions: #{resourceDimensions}" + + thresholdPercentage = @@metric_threshold_hash[metricName] + @log.info "thresholdPercentage: #{thresholdPercentage}" + return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], + metricName, + percentage_metric_value, + resourceDimensions, + thresholdPercentage) + end + end else return [] #end if block for object type check end From 1bad74fb9fb4d3784714e189ebc14c41ce6cb915 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 14:38:12 -0700 Subject: [PATCH 06/36] add pv_used_bytes to mdm filter metrics conf --- build/linux/installer/conf/container.conf | 2 +- build/linux/installer/conf/kube.conf | 2 +- kubernetes/container-azm-ms-agentconfig.yaml | 12 +++++++----- source/plugins/ruby/MdmMetricsGenerator.rb | 3 ++- source/plugins/ruby/filter_cadvisor2mdm.rb | 3 ++- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf index f02ec0131..8988c24bd 100644 --- a/build/linux/installer/conf/container.conf +++ b/build/linux/installer/conf/container.conf @@ -46,7 +46,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pv_used_bytes log_level info diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 9ada8425f..50ecb3a6a 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -74,7 +74,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pv_used_bytes log_level info diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index c4b300e9d..df175d700 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -19,7 +19,7 @@ data: # kube-system log collection is disabled by default in the absence of 'log_collection_settings.stdout' setting. If you want to enable kube-system, remove it from the following setting. # If you want to continue to disable kube-system log collection keep this namespace in the following setting and add any other namespace you want to disable log collection to the array. # In the absense of this configmap, default value for exclude_namespaces = ["kube-system"] - exclude_namespaces = ["kube-system"] + exclude_namespaces = [] [log_collection_settings.stderr] # Default value for enabled is true @@ -28,24 +28,24 @@ data: # kube-system log collection is disabled by default in the absence of 'log_collection_settings.stderr' setting. If you want to enable kube-system, remove it from the following setting. # If you want to continue to disable kube-system log collection keep this namespace in the following setting and add any other namespace you want to disable log collection to the array. # In the absense of this cofigmap, default value for exclude_namespaces = ["kube-system"] - exclude_namespaces = ["kube-system"] + exclude_namespaces = [] [log_collection_settings.env_var] # In the absense of this configmap, default value for enabled is true enabled = true [log_collection_settings.enrich_container_logs] # In the absense of this configmap, default value for enrich_container_logs is false - enabled = false + enabled = true # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image [log_collection_settings.collect_all_kube_events] # In the absense of this configmap, default value for collect_all_kube_events is false # When the setting is set to false, only the kube events with !normal event type will be collected - enabled = false + enabled = true # When this is enabled (enabled = true), all kube events including normal events will be collected [log_collection_settings.collect_kube_system_pv_metrics] # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false # When the setting is set to false, only the pv metrics outside the kube_system namespace will be collected - enabled = false + enabled = true # When this is enabled (enabled = true), pv metrics including those in the kube_system namespace will be collected prometheus-data-collection-settings: |- @@ -106,6 +106,8 @@ data: container_memory_rss_threshold_percentage = 95.0 # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 + # Threshold for pv usage bytes, metric will be sent only when pv utilization exceeds or becomes equal to the following percentage + pv_usage_threshold_percentage = 80.0 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 9c0873602..3b801e2de 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -377,11 +377,12 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = memoryWorkingSetThresholdFloat end + #pvUsagePercentageThreshold = 80.0 pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"] @log.info "pvUsagePercentageThreshold: #{pvUsagePercentageThreshold}" if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) - metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = pvUsagePercentageThresholdFloat + metric_threshold_hash["pv_used_bytes"] = pvUsagePercentageThresholdFloat end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 9fb0af5a2..35a68ba2e 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -16,7 +16,7 @@ class CAdvisor2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" config_param :custom_metrics_azure_regions, :string - config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,pv_used_bytes" + config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,'pv_used_bytes'" @@hostName = (OMS::Common.get_hostname) @@ -129,6 +129,7 @@ def flushMetricTelemetry def filter(tag, time, record) begin + @log.info "Tag: #{tag}" if @process_incoming_stream object_name = record["DataItems"][0]["ObjectName"] counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] From 7068629b2058f4f4851b212b42d1789ad420b8bd Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 15:18:13 -0700 Subject: [PATCH 07/36] filter fixes --- source/plugins/ruby/filter_cadvisor2mdm.rb | 7 ++++++- source/plugins/ruby/in_cadvisor_perf.rb | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 35a68ba2e..ce5880b03 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -135,6 +135,10 @@ def filter(tag, time, record) counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] percentage_metric_value = 0.0 metric_value = record["DataItems"][0]["Collections"][0]["Value"] + data_type = record["DataType"] + ip_name = record["IPName"] + @log.info "Data Type: #{data_type}" + @log.info "IP Name: #{data_type}" if object_name == Constants::OBJECT_NAME_K8S_NODE && @metrics_to_collect_hash.key?(counter_name.downcase) # Compute and send % CPU and Memory @@ -199,9 +203,10 @@ def filter(tag, time, record) else return [] end #end if block for percentage metric > configured threshold % check - elsif tag == Constants::INSIGHTSMETRICS_FLUENT_TAG + elsif data_type == "INSIGHTS_METRICS_BLOB" @log.info "insights metrics in filter_cadvisor2mdm" record["DataItems"].each do |dataItem| + @log.info "dataItem: #{dataItem}" if dataItem["Name"] == "pv_used_bytes" @log.info "pv_used_bytes is a data item" metricName = dataItem["Name"] diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index a44365e9d..b706ff00a 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -88,6 +88,7 @@ def enumerate() end router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") From 94348cd80ef4ab1607b39ec205a24d52f2eaa11c Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 16:16:59 -0700 Subject: [PATCH 08/36] more filter fixes --- source/plugins/ruby/filter_cadvisor2mdm.rb | 74 +++++++++++----------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index ce5880b03..6ab723634 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -131,14 +131,48 @@ def filter(tag, time, record) begin @log.info "Tag: #{tag}" if @process_incoming_stream - object_name = record["DataItems"][0]["ObjectName"] - counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] - percentage_metric_value = 0.0 - metric_value = record["DataItems"][0]["Collections"][0]["Value"] data_type = record["DataType"] ip_name = record["IPName"] @log.info "Data Type: #{data_type}" @log.info "IP Name: #{data_type}" + + if data_type == "INSIGHTS_METRICS_BLOB" + @log.info "insights metrics in filter_cadvisor2mdm" + record["DataItems"].each do |dataItem| + @log.info "dataItem: #{dataItem}" + if dataItem["Name"] == "pv_used_bytes" + @log.info "pv_used_bytes is a data item" + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"]["pv_capacity_bytes"] + if capacity != 0 + percentage_metric_value = (usage) * 100 / capacity + @log.info "capacity is not 0" + end + @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + resourceDimensions = {} + resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] + resourceDimensions[1] = "podName" + resourceDimensions[2] = "controllerName" + resourceDimensions[3] = dataItem["Tags"]["podNamespace"] + @log.info "resourceDimensions: #{resourceDimensions}" + + thresholdPercentage = @@metric_threshold_hash[metricName] + @log.info "thresholdPercentage: #{thresholdPercentage}" + return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], + metricName, + percentage_metric_value, + resourceDimensions, + thresholdPercentage) + end + end + + object_name = record["DataItems"][0]["ObjectName"] + counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] + percentage_metric_value = 0.0 + metric_value = record["DataItems"][0]["Collections"][0]["Value"] if object_name == Constants::OBJECT_NAME_K8S_NODE && @metrics_to_collect_hash.key?(counter_name.downcase) # Compute and send % CPU and Memory @@ -203,38 +237,6 @@ def filter(tag, time, record) else return [] end #end if block for percentage metric > configured threshold % check - elsif data_type == "INSIGHTS_METRICS_BLOB" - @log.info "insights metrics in filter_cadvisor2mdm" - record["DataItems"].each do |dataItem| - @log.info "dataItem: #{dataItem}" - if dataItem["Name"] == "pv_used_bytes" - @log.info "pv_used_bytes is a data item" - metricName = dataItem["Name"] - usage = dataItem["Value"] - capacity = dataItem["Tags"]["pv_capacity_bytes"] - if capacity != 0 - percentage_metric_value = (usage) * 100 / capacity - @log.info "capacity is not 0" - end - @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" - @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - - resourceDimensions = {} - resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] - resourceDimensions[1] = "podName" - resourceDimensions[2] = "controllerName" - resourceDimensions[3] = dataItem["Tags"]["podNamespace"] - @log.info "resourceDimensions: #{resourceDimensions}" - - thresholdPercentage = @@metric_threshold_hash[metricName] - @log.info "thresholdPercentage: #{thresholdPercentage}" - return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], - metricName, - percentage_metric_value, - resourceDimensions, - thresholdPercentage) - end - end else return [] #end if block for object type check end From 58230fd459370af2aec88fa8c726a1b2b3744020 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 16:19:58 -0700 Subject: [PATCH 09/36] end statement fix --- source/plugins/ruby/filter_cadvisor2mdm.rb | 53 +++++++++++----------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 6ab723634..031176126 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -135,37 +135,38 @@ def filter(tag, time, record) ip_name = record["IPName"] @log.info "Data Type: #{data_type}" @log.info "IP Name: #{data_type}" - + if data_type == "INSIGHTS_METRICS_BLOB" - @log.info "insights metrics in filter_cadvisor2mdm" - record["DataItems"].each do |dataItem| - @log.info "dataItem: #{dataItem}" - if dataItem["Name"] == "pv_used_bytes" - @log.info "pv_used_bytes is a data item" - metricName = dataItem["Name"] - usage = dataItem["Value"] - capacity = dataItem["Tags"]["pv_capacity_bytes"] - if capacity != 0 - percentage_metric_value = (usage) * 100 / capacity - @log.info "capacity is not 0" - end - @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" - @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - - resourceDimensions = {} - resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] - resourceDimensions[1] = "podName" - resourceDimensions[2] = "controllerName" - resourceDimensions[3] = dataItem["Tags"]["podNamespace"] - @log.info "resourceDimensions: #{resourceDimensions}" - - thresholdPercentage = @@metric_threshold_hash[metricName] - @log.info "thresholdPercentage: #{thresholdPercentage}" - return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], + @log.info "insights metrics in filter_cadvisor2mdm" + record["DataItems"].each do |dataItem| + @log.info "dataItem: #{dataItem}" + if dataItem["Name"] == "pv_used_bytes" + @log.info "pv_used_bytes is a data item" + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"]["pv_capacity_bytes"] + if capacity != 0 + percentage_metric_value = (usage) * 100 / capacity + @log.info "capacity is not 0" + end + @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + resourceDimensions = {} + resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] + resourceDimensions[1] = "podName" + resourceDimensions[2] = "controllerName" + resourceDimensions[3] = dataItem["Tags"]["podNamespace"] + @log.info "resourceDimensions: #{resourceDimensions}" + + thresholdPercentage = @@metric_threshold_hash[metricName] + @log.info "thresholdPercentage: #{thresholdPercentage}" + return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, percentage_metric_value, resourceDimensions, thresholdPercentage) + end end end From f68c04a1d1ad21a633d307d8992d160b849677be Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 25 Aug 2020 09:03:12 -0700 Subject: [PATCH 10/36] log fixes --- source/plugins/ruby/filter_cadvisor2mdm.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 031176126..2633b31ac 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -149,7 +149,7 @@ def filter(tag, time, record) percentage_metric_value = (usage) * 100 / capacity @log.info "capacity is not 0" end - @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" + @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" resourceDimensions = {} From 46c1b50a6fe31d0d83ee63b5a8e105eedaa3587a Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 25 Aug 2020 12:10:12 -0700 Subject: [PATCH 11/36] all pv records to mdm --- source/plugins/ruby/MdmMetricsGenerator.rb | 1 + source/plugins/ruby/filter_cadvisor2mdm.rb | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 3b801e2de..c250cf7c7 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -251,6 +251,7 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag containerResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, } + @log.info "resourceUtilRecord: #{resourceUtilRecord}" records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) rescue => errorStr @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 2633b31ac..c3c2b49be 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -138,6 +138,8 @@ def filter(tag, time, record) if data_type == "INSIGHTS_METRICS_BLOB" @log.info "insights metrics in filter_cadvisor2mdm" + @log.info "#{record["DataItems"]}" + mdmMetrics = [] record["DataItems"].each do |dataItem| @log.info "dataItem: #{dataItem}" if dataItem["Name"] == "pv_used_bytes" @@ -152,21 +154,19 @@ def filter(tag, time, record) @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - resourceDimensions = {} - resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] - resourceDimensions[1] = "podName" - resourceDimensions[2] = "controllerName" - resourceDimensions[3] = dataItem["Tags"]["podNamespace"] + resourceDimensions = [dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME], + "podName", "controllerName", dataItem["Tags"]["podNamespace"]].join("~~") @log.info "resourceDimensions: #{resourceDimensions}" thresholdPercentage = @@metric_threshold_hash[metricName] @log.info "thresholdPercentage: #{thresholdPercentage}" - return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], + mdmMetrics.push(MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, percentage_metric_value, resourceDimensions, - thresholdPercentage) + thresholdPercentage)) end + return mdmMetrics end end From db24b0fe889207610c6ce9eadde90f5363563a5e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 25 Aug 2020 13:04:21 -0700 Subject: [PATCH 12/36] different mdm generator method --- source/plugins/ruby/MdmMetricsGenerator.rb | 27 ++++++++++++++++++++++ source/plugins/ruby/filter_cadvisor2mdm.rb | 9 ++++---- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index c250cf7c7..1555f75b0 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -35,6 +35,7 @@ class MdmMetricsGenerator Constants::CPU_USAGE_NANO_CORES => Constants::MDM_CONTAINER_CPU_UTILIZATION_METRIC, Constants::MEMORY_RSS_BYTES => Constants::MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC, Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC, + "pv_used_bytes" => "pv_usage_percentage" } # Setting this to true since we need to send zero filled metrics at startup. If metrics are absent alert creation fails @@ -260,6 +261,32 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag return records end + def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage) + records = [] + begin + @log.info "resource dimensions: #{dims}" + # get dimension values + containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] + podNamespace = dims["podNamespace"] + resourceUtilRecord = MdmAlertTemplates::Container_resource_utilization_template % { + timestamp: recordTimeStamp, + metricName: @@container_metric_name_metric_percentage_name_hash[metricName], + containerNameDimValue: containerName, + podNameDimValue: "podName", + controllerNameDimValue: "controllerName", + namespaceDimValue: podNamespace, + containerResourceUtilizationPercentage: percentageMetricValue, + thresholdPercentageDimValue: thresholdPercentage, + } + @log.info "resourceUtilRecord: #{resourceUtilRecord}" + records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) + rescue => errorStr + @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + return records + end + def getDiskUsageMetricRecords(record) records = [] usedPercent = nil diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index c3c2b49be..73a280fa7 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -148,19 +148,18 @@ def filter(tag, time, record) usage = dataItem["Value"] capacity = dataItem["Tags"]["pv_capacity_bytes"] if capacity != 0 - percentage_metric_value = (usage) * 100 / capacity + percentage_metric_value = (usage * 100.0) / capacity @log.info "capacity is not 0" end @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - resourceDimensions = [dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME], - "podName", "controllerName", dataItem["Tags"]["podNamespace"]].join("~~") - @log.info "resourceDimensions: #{resourceDimensions}" + resourceDimensions = dataItem["Tags"] + @log.info "#{resourceDimensions}" thresholdPercentage = @@metric_threshold_hash[metricName] @log.info "thresholdPercentage: #{thresholdPercentage}" - mdmMetrics.push(MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], + mdmMetrics.push(MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, percentage_metric_value, resourceDimensions, From 9d6874fcfe834b1c1fdabe26d011a33d990eec70 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 25 Aug 2020 17:24:38 -0700 Subject: [PATCH 13/36] out_mdm log path --- source/plugins/ruby/out_mdm.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index d801edb9a..bd36662b5 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -51,6 +51,7 @@ def initialize def configure(conf) s = conf.add_element("secondary") s["type"] = ChunkErrorHandler::SecondaryName + @log = Logger.new("/var/opt/microsoft/docker-cimprov/log/out_mdm.log", 1, 5000000) super end From cdf96a023c6e1fe0461bae655b87cf145b2ac5db Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 26 Aug 2020 12:19:34 -0700 Subject: [PATCH 14/36] try to get out_mdm logging path --- source/plugins/ruby/filter_cadvisor2mdm.rb | 3 +++ source/plugins/ruby/out_mdm.rb | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 73a280fa7..097b7df35 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -120,6 +120,7 @@ def flushMetricTelemetry @containersExceededCpuThreshold = false @containersExceededMemRssThreshold = false @containersExceededMemWorkingSetThreshold = false + @pvExceededUsageThreshold = false end rescue => errorStr @log.info "Error in flushMetricTelemetry: #{errorStr}" @@ -165,6 +166,8 @@ def filter(tag, time, record) resourceDimensions, thresholdPercentage)) end + flushMetricTelemetry + setThresholdExceededTelemetry(metricName) return mdmMetrics end end diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index bd36662b5..f658af612 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -4,6 +4,7 @@ module Fluent class OutputMDM < BufferedOutput config_param :retry_mdm_post_wait_minutes, :integer + config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/out_mdm.log" Plugin.register_output("out_mdm", self) @@ -51,7 +52,7 @@ def initialize def configure(conf) s = conf.add_element("secondary") s["type"] = ChunkErrorHandler::SecondaryName - @log = Logger.new("/var/opt/microsoft/docker-cimprov/log/out_mdm.log", 1, 5000000) + @log = Logger.new(@log_path, 1, 5000000) super end From c902df6155b39c46d2db8a65fa1e482b9b54e7ad Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 26 Aug 2020 16:43:50 -0700 Subject: [PATCH 15/36] pv metric now sending to ME --- source/plugins/ruby/filter_cadvisor2mdm.rb | 4 +++- source/plugins/ruby/out_mdm.rb | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 097b7df35..bf8c53cb0 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -168,7 +168,7 @@ def filter(tag, time, record) end flushMetricTelemetry setThresholdExceededTelemetry(metricName) - return mdmMetrics + return mdmMetrics[0] end end @@ -304,7 +304,9 @@ def filter_stream(tag, es) es.each { |time, record| filtered_records = filter(tag, time, record) + @log.info "filtered records: #{filtered_records}" filtered_records.each { |filtered_record| + @log.info "filtered_record: #{filtered_record}" new_es.add(time, filtered_record) if filtered_record } if filtered_records } diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index f658af612..91563e100 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -14,6 +14,7 @@ def initialize require "net/https" require "uri" require "yajl/json_gem" + require "logger" require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" require_relative "constants" @@ -187,7 +188,7 @@ def write_status_file(success, message) # Convert the event to a raw string. def format(tag, time, record) if record != {} - @log.trace "Buffering #{tag}" + #@log.trace "Buffering #{tag}" return [tag, record].to_msgpack else return "" @@ -236,6 +237,7 @@ def send_to_mdm(post_body) request.body = post_body.join("\n") @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024}" response = @http_client.request(request) + @log.info "REQUEST RESPONSE: #{response}" response.value # this throws for non 200 HTTP response code @log.info "HTTP Post Response Code : #{response.code}" if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now From 0f41269f99f388cc0067408988380fb698502a15 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 27 Aug 2020 09:32:08 -0700 Subject: [PATCH 16/36] add in threshold condition --- source/plugins/ruby/filter_cadvisor2mdm.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index bf8c53cb0..1756b1da4 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -160,11 +160,13 @@ def filter(tag, time, record) thresholdPercentage = @@metric_threshold_hash[metricName] @log.info "thresholdPercentage: #{thresholdPercentage}" - mdmMetrics.push(MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], + if percentage_metric_value >= thresholdPercentage + mdmMetrics.push(MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, percentage_metric_value, resourceDimensions, thresholdPercentage)) + end end flushMetricTelemetry setThresholdExceededTelemetry(metricName) From d4148cc5097ec18a45afb1f44940c6ccc832484e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 27 Aug 2020 11:49:28 -0700 Subject: [PATCH 17/36] constants and consistent naming --- build/linux/installer/conf/container.conf | 2 +- build/linux/installer/conf/kube.conf | 2 +- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 10 +++++----- source/plugins/ruby/MdmMetricsGenerator.rb | 4 ++-- source/plugins/ruby/constants.rb | 6 ++++++ source/plugins/ruby/filter_cadvisor2mdm.rb | 10 +++++----- source/plugins/ruby/in_win_cadvisor_perf.rb | 1 + 7 files changed, 21 insertions(+), 14 deletions(-) diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf index 8988c24bd..e55c62fbc 100644 --- a/build/linux/installer/conf/container.conf +++ b/build/linux/installer/conf/container.conf @@ -46,7 +46,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pv_used_bytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes log_level info diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 50ecb3a6a..ba40b7a35 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -74,7 +74,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pv_used_bytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 228ff6ad9..b06e37e21 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -303,7 +303,7 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) - metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, "usedBytes", "pv_used_bytes", metricTime)) + metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, metricTime)) else @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") end @@ -372,10 +372,10 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName - metricTags["pvName"] = pvName - metricTags["pvcName"] = pvcName - metricTags["pv_capacity_bytes"] = volume["capacityBytes"] - metricTags["podNamespace"] = podNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] metricItem["Tags"] = metricTags diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 1555f75b0..dbcf84772 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -35,7 +35,7 @@ class MdmMetricsGenerator Constants::CPU_USAGE_NANO_CORES => Constants::MDM_CONTAINER_CPU_UTILIZATION_METRIC, Constants::MEMORY_RSS_BYTES => Constants::MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC, Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC, - "pv_used_bytes" => "pv_usage_percentage" + Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC } # Setting this to true since we need to send zero filled metrics at startup. If metrics are absent alert creation fails @@ -410,7 +410,7 @@ def getContainerResourceUtilizationThresholds @log.info "pvUsagePercentageThreshold: #{pvUsagePercentageThreshold}" if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) - metric_threshold_hash["pv_used_bytes"] = pvUsagePercentageThresholdFloat + metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 91dfc6077..3295c8823 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -13,6 +13,10 @@ class Constants INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGHTSMETRICS_TAGS_PV_NAME = "pvName" + INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" + INSIGHTSMETRICS_TAGS_POD_NAMESPACE = "podNamespace" + INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" REASON_OOM_KILLED = "oomkilled" #Kubestate (common) @@ -45,6 +49,7 @@ class Constants MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage" MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage" MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage" + MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage" MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage" MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" @@ -56,6 +61,7 @@ class Constants CPU_USAGE_MILLI_CORES = "cpuUsageMillicores" MEMORY_WORKING_SET_BYTES= "memoryWorkingSetBytes" MEMORY_RSS_BYTES = "memoryRssBytes" + PV_USED_BYTES = "pvUsedBytes" DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 1756b1da4..865f8bce6 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -16,7 +16,7 @@ class CAdvisor2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" config_param :custom_metrics_azure_regions, :string - config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,'pv_used_bytes'" + config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,Constants::PV_USED_BYTES" @@hostName = (OMS::Common.get_hostname) @@ -90,7 +90,7 @@ def setThresholdExceededTelemetry(metricName) @containersExceededMemRssThreshold = true elsif metricName == Constants::MEMORY_WORKING_SET_BYTES @containersExceededMemWorkingSetThreshold = true - elsif metricName == "pv_used_bytes" + elsif metricName == Constants::PV_USED_BYTES @pvExceededUsageThreshold = true end rescue => errorStr @@ -109,7 +109,7 @@ def flushMetricTelemetry properties["CpuThresholdPercentage"] = @@metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] properties["MemoryRssThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_RSS_BYTES] properties["MemoryWorkingSetThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] - properties["PVUsageThresholdPercentage"] = @@metric_threshold_hash["pv_used_bytes"] + properties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] # Keeping track of any containers that have exceeded threshold in the last flush interval properties["CpuThresholdExceededInLastFlushInterval"] = @containersExceededCpuThreshold properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold @@ -143,11 +143,11 @@ def filter(tag, time, record) mdmMetrics = [] record["DataItems"].each do |dataItem| @log.info "dataItem: #{dataItem}" - if dataItem["Name"] == "pv_used_bytes" + if dataItem["Name"] == Constants::PV_USED_BYTES @log.info "pv_used_bytes is a data item" metricName = dataItem["Name"] usage = dataItem["Value"] - capacity = dataItem["Tags"]["pv_capacity_bytes"] + capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] if capacity != 0 percentage_metric_value = (usage * 100.0) / capacity @log.info "capacity is not 0" diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 38868f2f5..4e90195e5 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -101,6 +101,7 @@ def enumerate() end router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end From 1d6cee6e44bdd5d9b0bca3f823230af5cfa4c858 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 27 Aug 2020 12:34:36 -0700 Subject: [PATCH 18/36] comments and code cleanup --- .../scripts/tomlparser-mdm-metrics-config.rb | 2 +- kubernetes/container-azm-ms-agentconfig.yaml | 10 +++--- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 23 ++++--------- source/plugins/ruby/MdmMetricsGenerator.rb | 9 +++-- source/plugins/ruby/constants.rb | 1 + source/plugins/ruby/filter_cadvisor2mdm.rb | 33 ++++++++----------- 6 files changed, 30 insertions(+), 48 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index d2990ca0c..5a90b4b04 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -67,7 +67,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default " @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end - #PV + #Persistent Volume & Persistent Volume Claim pvUsageThreshold = resourceUtilization[:pv_usage_threshold_percentage] pvUsageThresholdFloat = pvUsageThreshold.to_f if pvUsageThresholdFloat.kind_of? Float diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index df175d700..58c9cdcd1 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -19,7 +19,7 @@ data: # kube-system log collection is disabled by default in the absence of 'log_collection_settings.stdout' setting. If you want to enable kube-system, remove it from the following setting. # If you want to continue to disable kube-system log collection keep this namespace in the following setting and add any other namespace you want to disable log collection to the array. # In the absense of this configmap, default value for exclude_namespaces = ["kube-system"] - exclude_namespaces = [] + exclude_namespaces = ["kube-system"] [log_collection_settings.stderr] # Default value for enabled is true @@ -28,24 +28,24 @@ data: # kube-system log collection is disabled by default in the absence of 'log_collection_settings.stderr' setting. If you want to enable kube-system, remove it from the following setting. # If you want to continue to disable kube-system log collection keep this namespace in the following setting and add any other namespace you want to disable log collection to the array. # In the absense of this cofigmap, default value for exclude_namespaces = ["kube-system"] - exclude_namespaces = [] + exclude_namespaces = ["kube-system"] [log_collection_settings.env_var] # In the absense of this configmap, default value for enabled is true enabled = true [log_collection_settings.enrich_container_logs] # In the absense of this configmap, default value for enrich_container_logs is false - enabled = true + enabled = ffalse # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image [log_collection_settings.collect_all_kube_events] # In the absense of this configmap, default value for collect_all_kube_events is false # When the setting is set to false, only the kube events with !normal event type will be collected - enabled = true + enabled = false # When this is enabled (enabled = true), all kube events including normal events will be collected [log_collection_settings.collect_kube_system_pv_metrics] # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false # When the setting is set to false, only the pv metrics outside the kube_system namespace will be collected - enabled = true + enabled = false # When this is enabled (enabled = true), pv metrics including those in the kube_system namespace will be collected prometheus-data-collection-settings: |- diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index b06e37e21..b30a79ff8 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -315,14 +315,6 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) end def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) - @Log.info("Getting PV metrics") - pvKubeSystem = @pvKubeSystemCollectionMetricsEnabled.nil? ? "pv kube-system nil" : "pv kube-system not nil" - @Log.info(pvKubeSystem) - @Log.info(@pvKubeSystemCollectionMetricsEnabled) - - pvKubeSystemCollectionMetrics = ENV["AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS"] - @Log.info(pvKubeSystemCollectionMetrics) - metricItems = [] clusterId = KubernetesApiClient.getClusterId clusterName = KubernetesApiClient.getClusterName @@ -333,20 +325,17 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m podName = pod["podRef"]["name"] podNamespace = pod["podRef"]["namespace"] - kubeSystemNamespace = false + excludeNamespace = false if (podNamespace.include? "kube-system") - @Log.info("kube-system namespace encountered") - if (pvKubeSystemCollectionMetrics == "true") - kubeSystemNamespace = false - @Log.info("kube-system namespace encountered - include") + if (@pvKubeSystemCollectionMetricsEnabled == "true") + excludeNamespace = false else - kubeSystemNamespace = true - @Log.info("kube-system namespace encountered - exclude") + excludeNamespace = true end end - if (!pod["containers"].nil? && !kubeSystemNamespace) + if (!pod["containers"].nil? && !excludeNamespace) pod["containers"].each do |container| containerName = container["name"] @@ -366,7 +355,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m metricItem["Name"] = metricNameToReturn metricItem["Value"] = volume[metricNameToCollect] metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = "container.azm.ms/pv" + metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE metricTags = {} metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index dbcf84772..e9953d0c1 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -264,10 +264,10 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage) records = [] begin - @log.info "resource dimensions: #{dims}" - # get dimension values containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] - podNamespace = dims["podNamespace"] + podNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] + + # Will need a different MDM Template resourceUtilRecord = MdmAlertTemplates::Container_resource_utilization_template % { timestamp: recordTimeStamp, metricName: @@container_metric_name_metric_percentage_name_hash[metricName], @@ -384,7 +384,7 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD - metric_threshold_hash["pvUsage"] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if !cpuThreshold.nil? && !cpuThreshold.empty? @@ -405,7 +405,6 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = memoryWorkingSetThresholdFloat end - #pvUsagePercentageThreshold = 80.0 pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"] @log.info "pvUsagePercentageThreshold: #{pvUsagePercentageThreshold}" if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 3295c8823..183cbc415 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -13,6 +13,7 @@ class Constants INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" INSIGHTSMETRICS_TAGS_PV_NAME = "pvName" INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" INSIGHTSMETRICS_TAGS_POD_NAMESPACE = "podNamespace" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 865f8bce6..700e90a9e 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -133,46 +133,39 @@ def filter(tag, time, record) @log.info "Tag: #{tag}" if @process_incoming_stream data_type = record["DataType"] - ip_name = record["IPName"] - @log.info "Data Type: #{data_type}" - @log.info "IP Name: #{data_type}" if data_type == "INSIGHTS_METRICS_BLOB" - @log.info "insights metrics in filter_cadvisor2mdm" - @log.info "#{record["DataItems"]}" mdmMetrics = [] record["DataItems"].each do |dataItem| - @log.info "dataItem: #{dataItem}" + if dataItem["Name"] == Constants::PV_USED_BYTES - @log.info "pv_used_bytes is a data item" metricName = dataItem["Name"] usage = dataItem["Value"] capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] if capacity != 0 percentage_metric_value = (usage * 100.0) / capacity - @log.info "capacity is not 0" end @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" resourceDimensions = dataItem["Tags"] - @log.info "#{resourceDimensions}" - thresholdPercentage = @@metric_threshold_hash[metricName] - @log.info "thresholdPercentage: #{thresholdPercentage}" + + flushMetricTelemetry if percentage_metric_value >= thresholdPercentage - mdmMetrics.push(MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], + setThresholdExceededTelemetry(metricName) + return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, percentage_metric_value, resourceDimensions, - thresholdPercentage)) - end - end - flushMetricTelemetry - setThresholdExceededTelemetry(metricName) - return mdmMetrics[0] - end - end + thresholdPercentage) + else + return [] + end # end if block for percentage metric > configured threshold % check + end # end if block for dataItem name check + end # end for block of looping through data items + return [] + end # end if block for insights metrics check object_name = record["DataItems"][0]["ObjectName"] counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] From 357914ad7b1101cd4e42f6ab6babe9341a1b276d Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 12:03:45 -0700 Subject: [PATCH 19/36] remove container name, add pod name/uid --- kubernetes/container-azm-ms-agentconfig.yaml | 2 +- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 63 ++++++++--------- source/plugins/ruby/MdmAlertTemplates.rb | 34 ++++++++++ source/plugins/ruby/MdmMetricsGenerator.rb | 13 ++-- source/plugins/ruby/constants.rb | 2 + source/plugins/ruby/filter_cadvisor2mdm.rb | 68 ++++++++++--------- 6 files changed, 111 insertions(+), 71 deletions(-) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 58c9cdcd1..083263baf 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -107,7 +107,7 @@ data: # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 # Threshold for pv usage bytes, metric will be sent only when pv utilization exceeds or becomes equal to the following percentage - pv_usage_threshold_percentage = 80.0 + pv_usage_threshold_percentage = 0.0 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index b30a79ff8..2c973ad2d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -335,41 +335,38 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m end - if (!pod["containers"].nil? && !excludeNamespace) - pod["containers"].each do |container| - containerName = container["name"] - - if (!pod["volume"].nil?) - pod["volume"].each do |volume| - if (!volume["pvcRef"].nil?) - pvcRef = volume["pvcRef"] - if (!pvcRef["name"].nil?) - - # A PVC exists on this volume - pvcName = pvcRef["name"] - pvName = volume["name"] - - metricItem = {} - metricItem["CollectionTime"] = metricPollTime - metricItem["Computer"] = hostName - metricItem["Name"] = metricNameToReturn - metricItem["Value"] = volume[metricNameToCollect] - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE + if (!excludeNamespace) + if (!pod["volume"].nil?) + pod["volume"].each do |volume| + if (!volume["pvcRef"].nil?) + pvcRef = volume["pvcRef"] + if (!pvcRef["name"].nil?) + + # A PVC exists on this volume + pvcName = pvcRef["name"] + pvName = volume["name"] + + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNameToReturn + metricItem["Value"] = volume[metricNameToCollect] + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName - metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName - metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName - metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace - metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] - - metricItem["Tags"] = metricTags + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] + + metricItem["Tags"] = metricTags - metricItems.push(metricItem) - end + metricItems.push(metricItem) end end end diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index 2e516a99d..d55435c1e 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -90,6 +90,40 @@ class MdmAlertTemplates } }' + PV_resource_utilization_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/persistentvolume", + "dimNames": [ + "podUID", + "podName", + "computerName", + "Kubernetes namespace", + "thresholdPercentage" + ], + "series": [ + { + "dimValues": [ + "%{podUidDimValue}", + "%{podNameDimValue}", + "%{computerNameDimValue}", + "%{namespaceDimValue}", + "%{thresholdPercentageDimValue}" + ], + "min": %{containerResourceUtilizationPercentage}, + "max": %{containerResourceUtilizationPercentage}, + "sum": %{containerResourceUtilizationPercentage}, + "count": 1 + } + ] + } + } + }' + + Node_resource_metrics_template = ' { "time": "%{timestamp}", diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index e9953d0c1..d09a52bab 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -261,19 +261,20 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag return records end - def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage) + def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percentageMetricValue, dims, thresholdPercentage) records = [] begin containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] podNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] + podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] + podUid = dims[INSIGHTSMETRICS_TAGS_POD_UID] - # Will need a different MDM Template - resourceUtilRecord = MdmAlertTemplates::Container_resource_utilization_template % { + resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % { timestamp: recordTimeStamp, metricName: @@container_metric_name_metric_percentage_name_hash[metricName], - containerNameDimValue: containerName, - podNameDimValue: "podName", - controllerNameDimValue: "controllerName", + podUidDimValue: podUid, + podNameDimValue: podName, + nodeNameDimValue: computer, namespaceDimValue: podNamespace, containerResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 183cbc415..493f098c6 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -13,9 +13,11 @@ class Constants INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGHTSMETRICS_TAGS_POD_UID = "podUID" INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" INSIGHTSMETRICS_TAGS_PV_NAME = "pvName" INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" + INSIGHTSMETRICS_TAGS_POD_NAME = "podName" INSIGHTSMETRICS_TAGS_POD_NAMESPACE = "podNamespace" INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 700e90a9e..592fd5da7 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -135,37 +135,8 @@ def filter(tag, time, record) data_type = record["DataType"] if data_type == "INSIGHTS_METRICS_BLOB" - mdmMetrics = [] - record["DataItems"].each do |dataItem| - - if dataItem["Name"] == Constants::PV_USED_BYTES - metricName = dataItem["Name"] - usage = dataItem["Value"] - capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] - if capacity != 0 - percentage_metric_value = (usage * 100.0) / capacity - end - @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" - @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - - resourceDimensions = dataItem["Tags"] - thresholdPercentage = @@metric_threshold_hash[metricName] - - flushMetricTelemetry - if percentage_metric_value >= thresholdPercentage - setThresholdExceededTelemetry(metricName) - return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], - metricName, - percentage_metric_value, - resourceDimensions, - thresholdPercentage) - else - return [] - end # end if block for percentage metric > configured threshold % check - end # end if block for dataItem name check - end # end for block of looping through data items - return [] - end # end if block for insights metrics check + return filterPVInsightsMetrics(record) + end object_name = record["DataItems"][0]["ObjectName"] counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] @@ -248,6 +219,41 @@ def filter(tag, time, record) end end + def filterPVInsightsMetrics(record) + mdmMetrics = [] + record["DataItems"].each do |dataItem| + + if dataItem["Name"] == Constants::PV_USED_BYTES + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] + if capacity != 0 + percentage_metric_value = (usage * 100.0) / capacity + end + @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + computer = dataItem["Computer"] + resourceDimensions = dataItem["Tags"] + thresholdPercentage = @@metric_threshold_hash[metricName] + + flushMetricTelemetry + if percentage_metric_value >= thresholdPercentage + setThresholdExceededTelemetry(metricName) + return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], + metricName, + computer, + percentage_metric_value, + resourceDimensions, + thresholdPercentage) + else + return [] + end # end if block for percentage metric > configured threshold % check + end # end if block for dataItem name check + end # end for block of looping through data items + return [] + end + def ensure_cpu_memory_capacity_set if @cpu_capacity != 0.0 && @memory_capacity != 0.0 @log.info "CPU And Memory Capacity are already set" From 9377262b03d8607f0d270ba05cb3569c640fb75d Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 12:59:59 -0700 Subject: [PATCH 20/36] log fixes and constnat change --- source/plugins/ruby/MdmMetricsGenerator.rb | 4 ++-- source/plugins/ruby/constants.rb | 2 +- source/plugins/ruby/filter_cadvisor2mdm.rb | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index d09a52bab..b5086a744 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -267,7 +267,7 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] podNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] - podUid = dims[INSIGHTSMETRICS_TAGS_POD_UID] + podUid = dims[Constants::INSIGHTSMETRICS_TAGS_POD_UID] resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % { timestamp: recordTimeStamp, @@ -282,7 +282,7 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen @log.info "resourceUtilRecord: #{resourceUtilRecord}" records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) rescue => errorStr - @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}" + @log.info "Error in getPVResourceUtilMetricRecords: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return records diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 493f098c6..299b1c248 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -68,7 +68,7 @@ class Constants DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 - DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 80.0 + DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 CONTROLLER_KIND_JOB = "job" CONTAINER_TERMINATION_REASON_COMPLETED = "completed" CONTAINER_STATE_TERMINATED = "terminated" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 592fd5da7..7353d9050 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -135,6 +135,8 @@ def filter(tag, time, record) data_type = record["DataType"] if data_type == "INSIGHTS_METRICS_BLOB" + @log.info "Insights Metrics" + @log.info "record: #{record}" return filterPVInsightsMetrics(record) end From ee14b2bdfd9fdab9d9b00362a5c53f8d4b5a8e3f Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 13:49:06 -0700 Subject: [PATCH 21/36] naming fix --- source/plugins/ruby/MdmMetricsGenerator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index b5086a744..329d91813 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -274,7 +274,7 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen metricName: @@container_metric_name_metric_percentage_name_hash[metricName], podUidDimValue: podUid, podNameDimValue: podName, - nodeNameDimValue: computer, + computerNameDimValue: computer, namespaceDimValue: podNamespace, containerResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, From c1d46e8a7a9818b00b9b08b77aa1b25f0588a129 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 14:55:58 -0700 Subject: [PATCH 22/36] cleanup --- kubernetes/container-azm-ms-agentconfig.yaml | 2 +- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 9 ++------- source/plugins/ruby/MdmMetricsGenerator.rb | 3 --- source/plugins/ruby/filter_cadvisor2mdm.rb | 8 ++------ source/plugins/ruby/out_mdm.rb | 6 +----- 5 files changed, 6 insertions(+), 22 deletions(-) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 083263baf..fe80539d4 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -35,7 +35,7 @@ data: enabled = true [log_collection_settings.enrich_container_logs] # In the absense of this configmap, default value for enrich_container_logs is false - enabled = ffalse + enabled = false # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image [log_collection_settings.collect_all_kube_events] # In the absense of this configmap, default value for collect_all_kube_events is false diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 2c973ad2d..8d65c16ea 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -326,15 +326,10 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m podNamespace = pod["podRef"]["namespace"] excludeNamespace = false - if (podNamespace.include? "kube-system") - if (@pvKubeSystemCollectionMetricsEnabled == "true") - excludeNamespace = false - else - excludeNamespace = true - end + if (podNamespace.include? "kube-system" && @pvKubeSystemCollectionMetricsEnabled == "false") + excludeNamespace = true end - if (!excludeNamespace) if (!pod["volume"].nil?) pod["volume"].each do |volume| diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 329d91813..e22660c71 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -252,7 +252,6 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag containerResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, } - @log.info "resourceUtilRecord: #{resourceUtilRecord}" records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) rescue => errorStr @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}" @@ -279,7 +278,6 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen containerResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, } - @log.info "resourceUtilRecord: #{resourceUtilRecord}" records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) rescue => errorStr @log.info "Error in getPVResourceUtilMetricRecords: #{errorStr}" @@ -407,7 +405,6 @@ def getContainerResourceUtilizationThresholds end pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"] - @log.info "pvUsagePercentageThreshold: #{pvUsagePercentageThreshold}" if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 7353d9050..33291452c 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -130,13 +130,11 @@ def flushMetricTelemetry def filter(tag, time, record) begin - @log.info "Tag: #{tag}" if @process_incoming_stream - data_type = record["DataType"] + # Check if insights metrics for PV metrics + data_type = record["DataType"] if data_type == "INSIGHTS_METRICS_BLOB" - @log.info "Insights Metrics" - @log.info "record: #{record}" return filterPVInsightsMetrics(record) end @@ -307,9 +305,7 @@ def filter_stream(tag, es) es.each { |time, record| filtered_records = filter(tag, time, record) - @log.info "filtered records: #{filtered_records}" filtered_records.each { |filtered_record| - @log.info "filtered_record: #{filtered_record}" new_es.add(time, filtered_record) if filtered_record } if filtered_records } diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index 91563e100..d801edb9a 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -4,7 +4,6 @@ module Fluent class OutputMDM < BufferedOutput config_param :retry_mdm_post_wait_minutes, :integer - config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/out_mdm.log" Plugin.register_output("out_mdm", self) @@ -14,7 +13,6 @@ def initialize require "net/https" require "uri" require "yajl/json_gem" - require "logger" require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" require_relative "constants" @@ -53,7 +51,6 @@ def initialize def configure(conf) s = conf.add_element("secondary") s["type"] = ChunkErrorHandler::SecondaryName - @log = Logger.new(@log_path, 1, 5000000) super end @@ -188,7 +185,7 @@ def write_status_file(success, message) # Convert the event to a raw string. def format(tag, time, record) if record != {} - #@log.trace "Buffering #{tag}" + @log.trace "Buffering #{tag}" return [tag, record].to_msgpack else return "" @@ -237,7 +234,6 @@ def send_to_mdm(post_body) request.body = post_body.join("\n") @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024}" response = @http_client.request(request) - @log.info "REQUEST RESPONSE: #{response}" response.value # this throws for non 200 HTTP response code @log.info "HTTP Post Response Code : #{response.code}" if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now From 130e5d72e79b37fe1bb2ea067200fdbc81ffa1fc Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 15:39:59 -0700 Subject: [PATCH 23/36] add pvUsedBytes as metric to collect --- kubernetes/omsagent.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index db788a37e..128f68697 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -125,7 +125,7 @@ data: type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info From d0f8d58e59764fca68355c5a71b1fb8c4b978645 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 15:41:05 -0700 Subject: [PATCH 24/36] more cleanup --- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 64 +++++++++---------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 8d65c16ea..7f06ba9d3 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -330,48 +330,46 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m excludeNamespace = true end - if (!excludeNamespace) - if (!pod["volume"].nil?) - pod["volume"].each do |volume| - if (!volume["pvcRef"].nil?) - pvcRef = volume["pvcRef"] - if (!pvcRef["name"].nil?) - - # A PVC exists on this volume - pvcName = pvcRef["name"] - pvName = volume["name"] - - metricItem = {} - metricItem["CollectionTime"] = metricPollTime - metricItem["Computer"] = hostName - metricItem["Name"] = metricNameToReturn - metricItem["Value"] = volume[metricNameToCollect] - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE + if (!excludeNamespace && !pod["volume"].nil?) + pod["volume"].each do |volume| + if (!volume["pvcRef"].nil?) + pvcRef = volume["pvcRef"] + if (!pvcRef["name"].nil?) + + # A PVC exists on this volume + pvcName = pvcRef["name"] + pvName = volume["name"] + + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNameToReturn + metricItem["Value"] = volume[metricNameToCollect] + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid - metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName - metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName - metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName - metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace - metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] - - metricItem["Tags"] = metricTags + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] + + metricItem["Tags"] = metricTags - metricItems.push(metricItem) - end + metricItems.push(metricItem) end end end end end rescue => errorStr - @Log.warn("getPersistentVolumeClaimMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + @Log.warn("getPersistentVolumeClaimMetrics failed: #{errorStr} for metric #{metricNameToCollect}") return metricItems - end + end return metricItems end From f0885e49a92989b3bfd50295f028658826c7abaf Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 16:28:35 -0700 Subject: [PATCH 25/36] boolean fix --- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 7f06ba9d3..3f5d5bb5d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -326,7 +326,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m podNamespace = pod["podRef"]["namespace"] excludeNamespace = false - if (podNamespace.include? "kube-system" && @pvKubeSystemCollectionMetricsEnabled == "false") + if (podNamespace.include? "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" excludeNamespace = true end From 62b84baa058168d77257d7e95bf09f0e3348ed34 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 17:03:47 -0700 Subject: [PATCH 26/36] set threshold to 60 --- kubernetes/container-azm-ms-agentconfig.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index fe80539d4..6974d09c6 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -107,7 +107,7 @@ data: # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 # Threshold for pv usage bytes, metric will be sent only when pv utilization exceeds or becomes equal to the following percentage - pv_usage_threshold_percentage = 0.0 + pv_usage_threshold_percentage = 60.0 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false From 8fca1274092aea55ba8d1c50928ae222e0da6189 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 3 Sep 2020 12:38:24 -0700 Subject: [PATCH 27/36] add check that pvUsedBytes is a configured metric to collect --- source/plugins/ruby/filter_cadvisor2mdm.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 33291452c..4ab3a0310 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -223,7 +223,7 @@ def filterPVInsightsMetrics(record) mdmMetrics = [] record["DataItems"].each do |dataItem| - if dataItem["Name"] == Constants::PV_USED_BYTES + if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase) metricName = dataItem["Name"] usage = dataItem["Value"] capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] From da0a34d688f1e04b91416e2ba042a53e1851a65e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 4 Sep 2020 10:13:08 -0700 Subject: [PATCH 28/36] code review feedback changes --- build/common/installer/scripts/tomlparser.rb | 14 ---- .../installer/datafiles/base_container.data | 1 + .../scripts/tomlparser-mdm-metrics-config.rb | 16 +++- .../tomlparser-metric-collection-config.rb | 79 +++++++++++++++++++ kubernetes/container-azm-ms-agentconfig.yaml | 14 ++-- kubernetes/linux/main.sh | 8 ++ .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 22 +++--- source/plugins/ruby/MdmAlertTemplates.rb | 14 ++-- source/plugins/ruby/MdmMetricsGenerator.rb | 8 +- source/plugins/ruby/constants.rb | 1 + source/plugins/ruby/filter_cadvisor2mdm.rb | 66 +++++++++------- 11 files changed, 170 insertions(+), 73 deletions(-) create mode 100644 build/linux/installer/scripts/tomlparser-metric-collection-config.rb diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index 51c0d7b13..7235ee0c3 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ b/build/common/installer/scripts/tomlparser.rb @@ -24,7 +24,6 @@ @excludePath = "*.csv2" #some invalid path @enrichContainerLogs = false @collectAllKubeEvents = false -@collectPVKubeSystemMetrics = false @containerLogsRoute = "" # Use parser to parse the configmap toml file to a ruby structure @@ -149,16 +148,6 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for kube event collection - #{errorStr}, using defaults, please check config map for errors") end - #Get PV kube-system enrichment setting - begin - if !parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil? - @collectPVKubeSystemMetrics = parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled] - puts "config::Using config map setting for PV kube-system collection" - end - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while reading config map settings for kube event collection - #{errorStr}, using defaults, please check config map for errors") - end - #Get container logs route setting begin if !parsedConfig[:log_collection_settings][:route_container_logs].nil? && !parsedConfig[:log_collection_settings][:route_container_logs][:version].nil? @@ -210,7 +199,6 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") file.write("export AZMON_CLUSTER_CONTAINER_LOG_ENRICH=#{@enrichContainerLogs}\n") file.write("export AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS=#{@collectAllKubeEvents}\n") - file.write("export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS=#{@collectPVKubeSystemMetrics}\n") file.write("export AZMON_CONTAINER_LOGS_ROUTE=#{@containerLogsRoute}\n") # Close file after writing all environment variables file.close @@ -256,8 +244,6 @@ def get_command_windows(env_variable_name, env_variable_value) file.write(commands) commands = get_command_windows('AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS', @collectAllKubeEvents) file.write(commands) - commands = get_command_windows('export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS', @collectPVKubeSystemMetrics) - file.write(commands) commands = get_command_windows('AZMON_CONTAINER_LOGS_ROUTE', @containerLogsRoute) file.write(commands) diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 87b89b14c..ca2538b79 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -120,6 +120,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser-prom-customconfig.rb; build/linux/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root /opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root /opt/tomlparser-health-config.rb; build/linux/installer/scripts/tomlparser-health-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 5a90b4b04..dd9a582b9 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -67,12 +67,20 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default " @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end - #Persistent Volume & Persistent Volume Claim + #Persistent Volume + noPVConfig = false pvUsageThreshold = resourceUtilization[:pv_usage_threshold_percentage] - pvUsageThresholdFloat = pvUsageThreshold.to_f - if pvUsageThresholdFloat.kind_of? Float - @percentagePVUsageThreshold = pvUsageThresholdFloat + if !pvUsageThreshold.nil? + pvUsageThresholdFloat = pvUsageThreshold.to_f + if pvUsageThresholdFloat.kind_of? Float + @percentagePVUsageThreshold = pvUsageThresholdFloat + else + noPVConfig = true + end else + noPVConfig = true + end + if (noPVConfig) puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end diff --git a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb new file mode 100644 index 000000000..c48c08cd8 --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb @@ -0,0 +1,79 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require_relative "tomlrb" +require_relative "ConfigParseErrorLogger" +require_relative "microsoft/omsagent/plugin/constants" +require_relative "../../../../source/plugins/ruby/ApplicationInsightsUtility.rb" + +@configMapMountPath = "/etc/config/settings/metric_collection_settings" +@configVersion = "" +@configSchemaVersion = "" + +# Setting default values which will be used in case they are not set in the configmap or if configmap doesnt exist +@collectPVKubeSystemMetrics = false + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for metric collection settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for metric collection settings not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for metric collection settings: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used for metric collection settings +def populateSettingValuesFromConfigMap(parsedConfig) + # Get metric collection settings for including or excluding kube-system namespace in PV metrics + begin + if !parsedConfig.nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil? + @collectPVKubeSystemMetrics = parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled] + puts "config::Using config map setting for PV kube-system collection" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for PV kube-system collection - #{errorStr}, using defaults, please check config map for errors") + end + + begin + if @collectPVKubeSystemMetrics + ApplicationInsightsUtility.sendCustomEvent("CollectPVKubeSystemMetricsEnabled", {}) + end + rescue => errorStr + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Metric Collection Settings Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version, so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end +end + +# Write the settings to file, so that they can be set as environment variables +file = File.open("config_metric_collection_env_var", "w") + +if !file.nil? + file.write("export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS=#{@collectPVKubeSystemMetrics}\n") + # Close file after writing all metric collection setting environment variables + file.close + puts "****************End Metric Collection Settings Processing********************" +else + puts "Exception while opening file for writing MDM metric config environment variables" + puts "****************End Metric Collection Settings Processing********************" +end diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 6974d09c6..b6e1364ad 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -42,11 +42,6 @@ data: # When the setting is set to false, only the kube events with !normal event type will be collected enabled = false # When this is enabled (enabled = true), all kube events including normal events will be collected - [log_collection_settings.collect_kube_system_pv_metrics] - # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false - # When the setting is set to false, only the pv metrics outside the kube_system namespace will be collected - enabled = false - # When this is enabled (enabled = true), pv metrics including those in the kube_system namespace will be collected prometheus-data-collection-settings: |- # Custom Prometheus metrics data collection settings @@ -96,6 +91,15 @@ data: #fieldpass = ["metric_to_pass1", "metric_to_pass12"] #fielddrop = ["metric_to_drop"] + + metric_collection_settings: |- + # Metrics collection settings for metrics sent to Log Analytics and MDM + [metric_collection_settings.collect_kube_system_pv_metrics] + # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false + # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected + enabled = false + # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected + alertable-metrics-configuration-settings: |- # Alertable metrics configuration settings for container resource utilization [alertable_metrics_configuration_settings.container_resource_utilization_thresholds] diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 311470660..d9fdc42e9 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -236,6 +236,14 @@ cat config_mdm_metrics_env_var | while read line; do done source config_mdm_metrics_env_var +#Parse the configmap to set the right environment variables for metric collection settings +/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb + +cat config_metric_collection_env_var | while read line; do + echo $line >> ~/.bashrc +done +source config_metric_collection_env_var + #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" #Defaults to use port 10255 diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 3f5d5bb5d..afde3401b 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -303,7 +303,7 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) - metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, metricTime)) + metricDataItems.concat(getPersistentVolumeMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, metricTime)) else @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") end @@ -314,29 +314,29 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) return metricDataItems end - def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) + def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId clusterName = KubernetesApiClient.getClusterName begin metricInfo = metricJSON metricInfo["pods"].each do |pod| - podUid = pod["podRef"]["uid"] - podName = pod["podRef"]["name"] - podNamespace = pod["podRef"]["namespace"] - excludeNamespace = false - if (podNamespace.include? "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" - excludeNamespace = true + podNamespace = pod["podRef"]["namespace"] + includeNamespace = false + if (podNamespace.downcase == "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" + includeNamespace = true end - if (!excludeNamespace && !pod["volume"].nil?) + if (!includeNamespace && !pod["volume"].nil?) pod["volume"].each do |volume| if (!volume["pvcRef"].nil?) pvcRef = volume["pvcRef"] if (!pvcRef["name"].nil?) # A PVC exists on this volume + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] pvcName = pvcRef["name"] pvName = volume["name"] @@ -344,7 +344,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m metricItem["CollectionTime"] = metricPollTime metricItem["Computer"] = hostName metricItem["Name"] = metricNameToReturn - metricItem["Value"] = volume[metricNameToCollect] + metricItem["Value"] = volume[metricNasmeToCollect] metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE @@ -367,7 +367,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m end end rescue => errorStr - @Log.warn("getPersistentVolumeClaimMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + @Log.warn("getPersistentVolumeMetrics failed: #{errorStr} for metric #{metricNameToCollect}") return metricItems end return metricItems diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index d55435c1e..d5107fea1 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -96,26 +96,24 @@ class MdmAlertTemplates "data": { "baseData": { "metric": "%{metricName}", - "namespace": "insights.container/persistentvolume", + "namespace": "insights.container/persistentvolumes", "dimNames": [ - "podUID", "podName", - "computerName", - "Kubernetes namespace", + "node", + "kubernetesNamespace", "thresholdPercentage" ], "series": [ { "dimValues": [ - "%{podUidDimValue}", "%{podNameDimValue}", "%{computerNameDimValue}", "%{namespaceDimValue}", "%{thresholdPercentageDimValue}" ], - "min": %{containerResourceUtilizationPercentage}, - "max": %{containerResourceUtilizationPercentage}, - "sum": %{containerResourceUtilizationPercentage}, + "min": %{pvResourceUtilizationPercentage}, + "max": %{pvResourceUtilizationPercentage}, + "sum": %{pvResourceUtilizationPercentage}, "count": 1 } ] diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index e22660c71..662a10322 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -35,6 +35,9 @@ class MdmMetricsGenerator Constants::CPU_USAGE_NANO_CORES => Constants::MDM_CONTAINER_CPU_UTILIZATION_METRIC, Constants::MEMORY_RSS_BYTES => Constants::MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC, Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC, + } + + @@pod_metric_name_metric_percentage_name_hash = { Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC } @@ -270,12 +273,11 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % { timestamp: recordTimeStamp, - metricName: @@container_metric_name_metric_percentage_name_hash[metricName], - podUidDimValue: podUid, + metricName: @@pod_metric_name_metric_percentage_name_hash[metricName], podNameDimValue: podName, computerNameDimValue: computer, namespaceDimValue: podNamespace, - containerResourceUtilizationPercentage: percentageMetricValue, + pvResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, } records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 299b1c248..9aea6eb3a 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -81,6 +81,7 @@ class Constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" + PV_METRICS_HEART_BEAT_EVENT = "PVUtilMdmHeartBeatEvent" TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 4ab3a0310..edd26603b 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -57,7 +57,6 @@ def start if @process_incoming_stream @cpu_capacity = 0.0 @memory_capacity = 0.0 - @pv_capacity = 0.0 ensure_cpu_memory_capacity_set @containerCpuLimitHash = {} @containerMemoryLimitHash = {} @@ -109,13 +108,18 @@ def flushMetricTelemetry properties["CpuThresholdPercentage"] = @@metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] properties["MemoryRssThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_RSS_BYTES] properties["MemoryWorkingSetThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] - properties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] # Keeping track of any containers that have exceeded threshold in the last flush interval properties["CpuThresholdExceededInLastFlushInterval"] = @containersExceededCpuThreshold properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold properties["MemWSetThresholdExceededInLastFlushInterval"] = @containersExceededMemWorkingSetThreshold - properties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT, properties) + + # Also send for PV usage metrics + pvProperties = {} + pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] + pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENTT, pvProperties) + @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i @containersExceededCpuThreshold = false @containersExceededMemRssThreshold = false @@ -220,38 +224,44 @@ def filter(tag, time, record) end def filterPVInsightsMetrics(record) - mdmMetrics = [] - record["DataItems"].each do |dataItem| - - if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase) - metricName = dataItem["Name"] - usage = dataItem["Value"] - capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] - if capacity != 0 - percentage_metric_value = (usage * 100.0) / capacity - end - @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" - @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + begin + mdmMetrics = [] + record["DataItems"].each do |dataItem| + + if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase) + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] + if capacity != 0 + percentage_metric_value = (usage * 100.0) / capacity + end + @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - computer = dataItem["Computer"] - resourceDimensions = dataItem["Tags"] - thresholdPercentage = @@metric_threshold_hash[metricName] + computer = dataItem["Computer"] + resourceDimensions = dataItem["Tags"] + thresholdPercentage = @@metric_threshold_hash[metricName] - flushMetricTelemetry - if percentage_metric_value >= thresholdPercentage - setThresholdExceededTelemetry(metricName) - return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], + flushMetricTelemetry + if percentage_metric_value >= thresholdPercentage + setThresholdExceededTelemetry(metricName) + return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, computer, percentage_metric_value, resourceDimensions, thresholdPercentage) - else - return [] - end # end if block for percentage metric > configured threshold % check - end # end if block for dataItem name check - end # end for block of looping through data items - return [] + else + return [] + end # end if block for percentage metric > configured threshold % check + end # end if block for dataItem name check + end # end for block of looping through data items + return [] + rescue Exception => e + @log.info "Error processing cadvisor insights metrics record Exception: #{e.class} Message: #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + return [] #return empty array if we ran into any errors + end end def ensure_cpu_memory_capacity_set From 68404bf374776a0067d7d8b8edf3b39808fda7cb Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 4 Sep 2020 14:48:59 -0700 Subject: [PATCH 29/36] after testing changes --- .../scripts/tomlparser-mdm-metrics-config.rb | 42 +++++++++++-------- .../tomlparser-metric-collection-config.rb | 10 +---- kubernetes/container-azm-ms-agentconfig.yaml | 5 ++- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 23 ++++++++-- source/plugins/ruby/constants.rb | 3 +- source/plugins/ruby/filter_cadvisor2mdm.rb | 26 ++++++++---- 6 files changed, 69 insertions(+), 40 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index dd9a582b9..04d664289 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -36,7 +36,7 @@ def parseConfigMap # Use the ruby structure created after config parsing to set the right values to be used for MDM metric configuration settings def populateSettingValuesFromConfigMap(parsedConfig) if !parsedConfig.nil? && !parsedConfig[:alertable_metrics_configuration_settings].nil? - # Get mdm metrics config settings for resource utilization + # Get mdm metrics config settings for container resource utilization begin resourceUtilization = parsedConfig[:alertable_metrics_configuration_settings][:container_resource_utilization_thresholds] if !resourceUtilization.nil? @@ -67,30 +67,38 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default " @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end - #Persistent Volume - noPVConfig = false - pvUsageThreshold = resourceUtilization[:pv_usage_threshold_percentage] + puts "config::Using config map settings for MDM metric configuration settings for container resource utilization" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for resource utilization - #{errorStr}, using defaults, please check config map for errors") + @percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD + @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD + @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD + end + + # Get mdm metrics config settings for PV utilization + begin + usingPVThresholdConfig = false + pvUtilization = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds] + if !pvUtilization.nil? + pvUsageThreshold = pvUtilization[:pv_usage_threshold_percentage] if !pvUsageThreshold.nil? pvUsageThresholdFloat = pvUsageThreshold.to_f if pvUsageThresholdFloat.kind_of? Float @percentagePVUsageThreshold = pvUsageThresholdFloat - else - noPVConfig = true + usingPVThresholdConfig = true end - else - noPVConfig = true - end - if (noPVConfig) - puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " - @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end - puts "config::Using config map settings for MDM metric configuration settings for resource utilization" + end + + if usingPVThresholdConfig + puts "config::Using config map settings for MDM metric configuration settings for PV utilization" + else + puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end rescue => errorStr - ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for resource utilization - #{errorStr}, using defaults, please check config map for errors") - @percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD - @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD - @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors") @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end end diff --git a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb index c48c08cd8..40d87b7f1 100644 --- a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb +++ b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb @@ -4,7 +4,6 @@ require_relative "tomlrb" require_relative "ConfigParseErrorLogger" require_relative "microsoft/omsagent/plugin/constants" -require_relative "../../../../source/plugins/ruby/ApplicationInsightsUtility.rb" @configMapMountPath = "/etc/config/settings/metric_collection_settings" @configVersion = "" @@ -37,19 +36,12 @@ def populateSettingValuesFromConfigMap(parsedConfig) # Get metric collection settings for including or excluding kube-system namespace in PV metrics begin if !parsedConfig.nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil? - @collectPVKubeSystemMetrics = parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled] + @collectPVKubeSystemMetrics = parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled] puts "config::Using config map setting for PV kube-system collection" end rescue => errorStr ConfigParseErrorLogger.logError("Exception while reading config map settings for PV kube-system collection - #{errorStr}, using defaults, please check config map for errors") end - - begin - if @collectPVKubeSystemMetrics - ApplicationInsightsUtility.sendCustomEvent("CollectPVKubeSystemMetricsEnabled", {}) - end - rescue => errorStr - end end @configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index b6e1364ad..aec1bb456 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -110,7 +110,10 @@ data: container_memory_rss_threshold_percentage = 95.0 # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 - # Threshold for pv usage bytes, metric will be sent only when pv utilization exceeds or becomes equal to the following percentage + + # Alertable metrics configuration settings for persistent volume utilization + [alertable_metrics_configuration_settings.pv_utilization_thresholds] + # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage pv_usage_threshold_percentage = 60.0 integrations: |- [integrations.azure_network_policy_manager] diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index afde3401b..3355a11db 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -54,6 +54,7 @@ class CAdvisorMetricsAPIClient @@winNodePrevMetricRate = {} @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i #Containers a hash of node name and the last time telemetry was sent for this node @@nodeTelemetryTimeTracker = {} @@ -315,6 +316,9 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) end def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) + telemetryimeDifference = (DateTime.now.to_time.to_i - @@telemetryPVKubeSystemMetricsTimeTracker).abs + telemetryTimeDifferenceInMinutes = telemetryTimeDifference / 60 + metricItems = [] clusterId = KubernetesApiClient.getClusterId clusterName = KubernetesApiClient.getClusterName @@ -323,12 +327,12 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricInfo["pods"].each do |pod| podNamespace = pod["podRef"]["namespace"] - includeNamespace = false + excludeNamespace = false if (podNamespace.downcase == "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" - includeNamespace = true + excludeNamespace = true end - if (!includeNamespace && !pod["volume"].nil?) + if (!excludeNamespace && !pod["volume"].nil?) pod["volume"].each do |volume| if (!volume["pvcRef"].nil?) pvcRef = volume["pvcRef"] @@ -344,7 +348,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricItem["CollectionTime"] = metricPollTime metricItem["Computer"] = hostName metricItem["Name"] = metricNameToReturn - metricItem["Value"] = volume[metricNasmeToCollect] + metricItem["Value"] = volume[metricNameToCollect] metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE @@ -370,6 +374,17 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric @Log.warn("getPersistentVolumeMetrics failed: #{errorStr} for metric #{metricNameToCollect}") return metricItems end + + # If kube-system metrics collection enabled, send telemetry + begin + if telemetryTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES && @pvKubeSystemCollectionMetricsEnabled == "true" + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT, {}) + @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + @Log.warn("getPersistentVolumeMetrics kube-system metrics enabled telemetry failed: #{errorStr}") + end + return metricItems end diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 9aea6eb3a..692fd6e4d 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -81,7 +81,8 @@ class Constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" - PV_METRICS_HEART_BEAT_EVENT = "PVUtilMdmHeartBeatEvent" + PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent" + PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled" TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index edd26603b..6cf1e3d72 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -46,6 +46,7 @@ def start @metrics_to_collect_hash = build_metrics_hash @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i # These variables keep track if any resource utilization threshold exceeded in the last 10 minutes @containersExceededCpuThreshold = false @@ -113,21 +114,30 @@ def flushMetricTelemetry properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold properties["MemWSetThresholdExceededInLastFlushInterval"] = @containersExceededMemWorkingSetThreshold ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT, properties) + @containersExceededCpuThreshold = false + @containersExceededMemRssThreshold = false + @containersExceededMemWorkingSetThreshold = false + @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + @log.info "Error in flushMetricTelemetry: #{errorStr} for container resource util telemetry" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end - # Also send for PV usage metrics + # Also send for PV usage metrics + begin + pvTimeDifference = (DateTime.now.to_time.to_i - @@pvUsageTelemetryTimeTracker).abs + pvTimeDifferenceInMinutes = pvTimeDifference / 60 + if (pvTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) pvProperties = {} pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold - ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENTT, pvProperties) - - @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i - @containersExceededCpuThreshold = false - @containersExceededMemRssThreshold = false - @containersExceededMemWorkingSetThreshold = false + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENT, pvProperties) @pvExceededUsageThreshold = false + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i end rescue => errorStr - @log.info "Error in flushMetricTelemetry: #{errorStr}" + @log.info "Error in flushMetricTelemetry: #{errorStr} for PV usage telemetry" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end From 4505b452f4e0bec5e80f6a2e40fd5ce1e19b639d Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 4 Sep 2020 16:21:05 -0700 Subject: [PATCH 30/36] whitespace fix --- source/plugins/ruby/filter_cadvisor2mdm.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 6cf1e3d72..3bc674ea8 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -244,7 +244,7 @@ def filterPVInsightsMetrics(record) capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] if capacity != 0 percentage_metric_value = (usage * 100.0) / capacity - end + end @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" From c08054b55886993ee95e1c38ba3be0d27bb93884 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 4 Sep 2020 16:32:20 -0700 Subject: [PATCH 31/36] variable name fix --- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 3355a11db..d815644c2 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -316,7 +316,7 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) end def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) - telemetryimeDifference = (DateTime.now.to_time.to_i - @@telemetryPVKubeSystemMetricsTimeTracker).abs + telemetryTimeDifference = (DateTime.now.to_time.to_i - @@telemetryPVKubeSystemMetricsTimeTracker).abs telemetryTimeDifferenceInMinutes = telemetryTimeDifference / 60 metricItems = [] From c88b9ab907fd762435f8ae67be34968ade3082d6 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 8 Sep 2020 11:37:00 -0700 Subject: [PATCH 32/36] naming changes --- .../installer/scripts/tomlparser-mdm-metrics-config.rb | 10 +++++----- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 4 ++-- source/plugins/ruby/constants.rb | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 04d664289..74f1c0726 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -78,20 +78,20 @@ def populateSettingValuesFromConfigMap(parsedConfig) # Get mdm metrics config settings for PV utilization begin - usingPVThresholdConfig = false - pvUtilization = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds] + isUsingPVThresholdConfig = false + pvUtilizationThresholds = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds] if !pvUtilization.nil? - pvUsageThreshold = pvUtilization[:pv_usage_threshold_percentage] + pvUsageThreshold = pvUtilizationThresholds[:pv_usage_threshold_percentage] if !pvUsageThreshold.nil? pvUsageThresholdFloat = pvUsageThreshold.to_f if pvUsageThresholdFloat.kind_of? Float @percentagePVUsageThreshold = pvUsageThresholdFloat - usingPVThresholdConfig = true + isUsingPVThresholdConfig = true end end end - if usingPVThresholdConfig + if isUsingPVThresholdConfig puts "config::Using config map settings for MDM metric configuration settings for PV utilization" else puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index d815644c2..bd1cd1000 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -342,7 +342,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric podUid = pod["podRef"]["uid"] podName = pod["podRef"]["name"] pvcName = pvcRef["name"] - pvName = volume["name"] + volumeName = volume["name"] metricItem = {} metricItem["CollectionTime"] = metricPollTime @@ -357,7 +357,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName - metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName + metricTags[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] = volumeName metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 692fd6e4d..5409a21b6 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -15,7 +15,7 @@ class Constants INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" INSIGHTSMETRICS_TAGS_POD_UID = "podUID" INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" - INSIGHTSMETRICS_TAGS_PV_NAME = "pvName" + INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName" INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" INSIGHTSMETRICS_TAGS_POD_NAME = "podName" INSIGHTSMETRICS_TAGS_POD_NAMESPACE = "podNamespace" From 668a69140357710ffa7e05479a04666f5211aa56 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 21 Sep 2020 13:06:05 -0700 Subject: [PATCH 33/36] match inventory schema naming --- .../linux/installer/scripts/tomlparser-mdm-metrics-config.rb | 2 +- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 5 ++--- source/plugins/ruby/constants.rb | 3 +-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 74f1c0726..345c51633 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -80,7 +80,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) begin isUsingPVThresholdConfig = false pvUtilizationThresholds = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds] - if !pvUtilization.nil? + if !pvUtilizationThresholds.nil? pvUsageThreshold = pvUtilizationThresholds[:pv_usage_threshold_percentage] if !pvUsageThreshold.nil? pvUsageThresholdFloat = pvUsageThreshold.to_f diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index bd1cd1000..7661bb7a1 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -342,7 +342,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric podUid = pod["podRef"]["uid"] podName = pod["podRef"]["name"] pvcName = pvcRef["name"] - volumeName = volume["name"] + pvcNamespace = pvcRef["namespace"] metricItem = {} metricItem["CollectionTime"] = metricPollTime @@ -357,9 +357,8 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName - metricTags[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] = volumeName metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName - metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = pvcNamespace metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] metricItem["Tags"] = metricTags diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 5409a21b6..ceca9f01b 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -15,10 +15,9 @@ class Constants INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" INSIGHTSMETRICS_TAGS_POD_UID = "podUID" INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" - INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName" INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" + INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace" INSIGHTSMETRICS_TAGS_POD_NAME = "podName" - INSIGHTSMETRICS_TAGS_POD_NAMESPACE = "podNamespace" INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" REASON_OOM_KILLED = "oomkilled" From bf57879e0c734dd9f40f817591a483cd660c5f04 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 21 Sep 2020 15:24:20 -0700 Subject: [PATCH 34/36] last naming fixes --- source/plugins/ruby/MdmMetricsGenerator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 662a10322..d3d870eca 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -267,7 +267,7 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen records = [] begin containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] - podNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] + podNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] podUid = dims[Constants::INSIGHTSMETRICS_TAGS_POD_UID] From a01af6f9d064e5a43fbb7cfa420d2f6fd5b2e55f Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 21 Sep 2020 16:11:01 -0700 Subject: [PATCH 35/36] change podUID to podUid to match KubePodInventory --- source/plugins/ruby/MdmMetricsGenerator.rb | 4 ++-- source/plugins/ruby/constants.rb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index d3d870eca..1e7db37cc 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -267,7 +267,7 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen records = [] begin containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] - podNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] + pvcNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] podUid = dims[Constants::INSIGHTSMETRICS_TAGS_POD_UID] @@ -276,7 +276,7 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen metricName: @@pod_metric_name_metric_percentage_name_hash[metricName], podNameDimValue: podName, computerNameDimValue: computer, - namespaceDimValue: podNamespace, + namespaceDimValue: pvcNamespace, pvResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, } diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index ceca9f01b..82a6e8814 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -13,7 +13,7 @@ class Constants INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" - INSIGHTSMETRICS_TAGS_POD_UID = "podUID" + INSIGHTSMETRICS_TAGS_POD_UID = "podUid" INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace" From b9b79c8ad971dce97b83ee891f7512b8c20e2179 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 22 Sep 2020 11:22:31 -0700 Subject: [PATCH 36/36] pv alert template --- .../PVUsagePercentage.json | 174 ++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 alerts/recommended_alerts_ARM/PVUsagePercentage.json diff --git a/alerts/recommended_alerts_ARM/PVUsagePercentage.json b/alerts/recommended_alerts_ARM/PVUsagePercentage.json new file mode 100644 index 000000000..e6cdbee15 --- /dev/null +++ b/alerts/recommended_alerts_ARM/PVUsagePercentage.json @@ -0,0 +1,174 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "alertName": { + "type": "string", + "minLength": 1, + "metadata": { + "description": "Name of the alert" + } + }, + "alertDescription": { + "type": "string", + "defaultValue": "This is a metric alert", + "metadata": { + "description": "Description of alert" + } + }, + "alertSeverity": { + "type": "int", + "defaultValue": 3, + "allowedValues": [ + 0, + 1, + 2, + 3, + 4 + ], + "metadata": { + "description": "Severity of alert {0,1,2,3,4}" + } + }, + "isEnabled": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Specifies whether the alert is enabled" + } + }, + "clusterResourceId": { + "type": "string", + "minLength": 1, + "metadata": { + "description": "Full Resource ID of the kubernetes cluster emitting the metric that will be used for the comparison. For example /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroups/ResourceGroupName/providers/Microsoft.ContainerService/managedClusters/cluster-xyz" + } + }, + "operator": { + "type": "string", + "defaultValue": "GreaterThan", + "allowedValues": [ + "Equals", + "NotEquals", + "GreaterThan", + "GreaterThanOrEqual", + "LessThan", + "LessThanOrEqual" + ], + "metadata": { + "description": "Operator comparing the current value with the threshold value." + } + }, + "threshold": { + "type": "int", + "defaultValue": 80, + "metadata": { + "description": "The threshold value at which the alert is activated." + }, + "minValue": 1, + "maxValue": 100 + }, + "timeAggregation": { + "type": "string", + "defaultValue": "Average", + "allowedValues": [ + "Average", + "Minimum", + "Maximum", + "Count" + ], + "metadata": { + "description": "How the data that is collected should be combined over time." + } + }, + "windowSize": { + "type": "string", + "defaultValue": "PT5M", + "allowedValues": [ + "PT1M", + "PT5M", + "PT15M", + "PT30M", + "PT1H", + "PT6H", + "PT12H", + "PT24H" + ], + "metadata": { + "description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format." + } + }, + "evaluationFrequency": { + "type": "string", + "defaultValue": "PT1M", + "allowedValues": [ + "PT1M", + "PT5M", + "PT15M", + "PT30M", + "PT1H" + ], + "metadata": { + "description": "how often the metric alert is evaluated represented in ISO 8601 duration format" + } + }, + "actionGroupId": { + "type": "string", + "defaultValue": "", + "metadata": { + "description": "The ID of the action group that is triggered when the alert is activated or deactivated" + } + } + }, + "variables": {}, + "resources": [ + { + "name": "[parameters('alertName')]", + "type": "Microsoft.Insights/metricAlerts", + "location": "global", + "apiVersion": "2018-03-01", + "tags": {}, + "properties": { + "description": "[parameters('alertDescription')]", + "severity": "[parameters('alertSeverity')]", + "enabled": "[parameters('isEnabled')]", + "scopes": [ + "[parameters('clusterResourceId')]" + ], + "evaluationFrequency": "[parameters('evaluationFrequency')]", + "windowSize": "[parameters('windowSize')]", + "criteria": { + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria", + "allOf": [ + { + "name": "1st criterion", + "metricName": "pvUsageExceededPercentage", + "metricNamespace": "Insights.Container/persistentvolumes", + "dimensions": [ + { + "name": "kubernetesNamespace", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "podName", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "operator": "[parameters('operator')]", + "threshold": "[parameters('threshold')]", + "timeAggregation": "[parameters('timeAggregation')]", + "skipMetricValidation": true + } + ] + }, + "actions": "[if(empty(parameters('actionGroupId')), json('null'), json(concat('[{\"actionGroupId\": \"',parameters('actionGroupId'),'\"}]')))]" + } + } + ] +}