diff --git a/alerts/recommended_alerts_ARM/PVUsagePercentage.json b/alerts/recommended_alerts_ARM/PVUsagePercentage.json new file mode 100644 index 000000000..e6cdbee15 --- /dev/null +++ b/alerts/recommended_alerts_ARM/PVUsagePercentage.json @@ -0,0 +1,174 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "alertName": { + "type": "string", + "minLength": 1, + "metadata": { + "description": "Name of the alert" + } + }, + "alertDescription": { + "type": "string", + "defaultValue": "This is a metric alert", + "metadata": { + "description": "Description of alert" + } + }, + "alertSeverity": { + "type": "int", + "defaultValue": 3, + "allowedValues": [ + 0, + 1, + 2, + 3, + 4 + ], + "metadata": { + "description": "Severity of alert {0,1,2,3,4}" + } + }, + "isEnabled": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Specifies whether the alert is enabled" + } + }, + "clusterResourceId": { + "type": "string", + "minLength": 1, + "metadata": { + "description": "Full Resource ID of the kubernetes cluster emitting the metric that will be used for the comparison. For example /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroups/ResourceGroupName/providers/Microsoft.ContainerService/managedClusters/cluster-xyz" + } + }, + "operator": { + "type": "string", + "defaultValue": "GreaterThan", + "allowedValues": [ + "Equals", + "NotEquals", + "GreaterThan", + "GreaterThanOrEqual", + "LessThan", + "LessThanOrEqual" + ], + "metadata": { + "description": "Operator comparing the current value with the threshold value." + } + }, + "threshold": { + "type": "int", + "defaultValue": 80, + "metadata": { + "description": "The threshold value at which the alert is activated." 
+ }, + "minValue": 1, + "maxValue": 100 + }, + "timeAggregation": { + "type": "string", + "defaultValue": "Average", + "allowedValues": [ + "Average", + "Minimum", + "Maximum", + "Count" + ], + "metadata": { + "description": "How the data that is collected should be combined over time." + } + }, + "windowSize": { + "type": "string", + "defaultValue": "PT5M", + "allowedValues": [ + "PT1M", + "PT5M", + "PT15M", + "PT30M", + "PT1H", + "PT6H", + "PT12H", + "PT24H" + ], + "metadata": { + "description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format." + } + }, + "evaluationFrequency": { + "type": "string", + "defaultValue": "PT1M", + "allowedValues": [ + "PT1M", + "PT5M", + "PT15M", + "PT30M", + "PT1H" + ], + "metadata": { + "description": "how often the metric alert is evaluated represented in ISO 8601 duration format" + } + }, + "actionGroupId": { + "type": "string", + "defaultValue": "", + "metadata": { + "description": "The ID of the action group that is triggered when the alert is activated or deactivated" + } + } + }, + "variables": {}, + "resources": [ + { + "name": "[parameters('alertName')]", + "type": "Microsoft.Insights/metricAlerts", + "location": "global", + "apiVersion": "2018-03-01", + "tags": {}, + "properties": { + "description": "[parameters('alertDescription')]", + "severity": "[parameters('alertSeverity')]", + "enabled": "[parameters('isEnabled')]", + "scopes": [ + "[parameters('clusterResourceId')]" + ], + "evaluationFrequency": "[parameters('evaluationFrequency')]", + "windowSize": "[parameters('windowSize')]", + "criteria": { + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria", + "allOf": [ + { + "name": "1st criterion", + "metricName": "pvUsageExceededPercentage", + "metricNamespace": "Insights.Container/persistentvolumes", + "dimensions": [ + { + "name": "kubernetesNamespace", + "operator": "Include", + "values": [ + "*" + ] + }, + 
{ + "name": "podName", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "operator": "[parameters('operator')]", + "threshold": "[parameters('threshold')]", + "timeAggregation": "[parameters('timeAggregation')]", + "skipMetricValidation": true + } + ] + }, + "actions": "[if(empty(parameters('actionGroupId')), json('null'), json(concat('[{\"actionGroupId\": \"',parameters('actionGroupId'),'\"}]')))]" + } + } + ] +} diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf index f02ec0131..e55c62fbc 100644 --- a/build/linux/installer/conf/container.conf +++ b/build/linux/installer/conf/container.conf @@ -46,7 +46,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes log_level info diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 9ada8425f..ba40b7a35 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -74,7 +74,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 87b89b14c..ca2538b79 100644 --- 
a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -120,6 +120,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser-prom-customconfig.rb; build/linux/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root /opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root /opt/tomlparser-health-config.rb; build/linux/installer/scripts/tomlparser-health-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 1c01dd8c6..345c51633 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -12,6 +12,7 @@ @percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD +@percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -35,7 +36,7 @@ def parseConfigMap # Use the ruby structure created after config parsing to set the right values to be used for MDM metric configuration settings def populateSettingValuesFromConfigMap(parsedConfig) if !parsedConfig.nil? && !parsedConfig[:alertable_metrics_configuration_settings].nil? 
- # Get mdm metrics config settings for resource utilization + # Get mdm metrics config settings for container resource utilization begin resourceUtilization = parsedConfig[:alertable_metrics_configuration_settings][:container_resource_utilization_thresholds] if !resourceUtilization.nil? @@ -66,7 +67,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default " @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end - puts "config::Using config map settings for MDM metric configuration settings for resource utilization" + puts "config::Using config map settings for MDM metric configuration settings for container resource utilization" end rescue => errorStr ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for resource utilization - #{errorStr}, using defaults, please check config map for errors") @@ -74,6 +75,32 @@ def populateSettingValuesFromConfigMap(parsedConfig) @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end + + # Get mdm metrics config settings for PV utilization + begin + isUsingPVThresholdConfig = false + pvUtilizationThresholds = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds] + if !pvUtilizationThresholds.nil? + pvUsageThreshold = pvUtilizationThresholds[:pv_usage_threshold_percentage] + if !pvUsageThreshold.nil? + pvUsageThresholdFloat = pvUsageThreshold.to_f + if pvUsageThresholdFloat.kind_of? 
Float + @percentagePVUsageThreshold = pvUsageThresholdFloat + isUsingPVThresholdConfig = true + end + end + end + + if isUsingPVThresholdConfig + puts "config::Using config map settings for MDM metric configuration settings for PV utilization" + else + puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors") + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + end end end @@ -97,6 +124,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_ALERT_CONTAINER_CPU_THRESHOLD=#{@percentageCpuUsageThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") + file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") # Close file after writing all MDM setting environment variables file.close puts "****************End MDM Metrics Config Processing********************" diff --git a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb new file mode 100644 index 000000000..40d87b7f1 --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb @@ -0,0 +1,71 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require_relative "tomlrb" +require_relative "ConfigParseErrorLogger" +require_relative "microsoft/omsagent/plugin/constants" + +@configMapMountPath = "/etc/config/settings/metric_collection_settings" +@configVersion = "" 
+@configSchemaVersion = "" + +# Setting default values which will be used in case they are not set in the configmap or if configmap doesn't exist +@collectPVKubeSystemMetrics = false + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for metric collection settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for metric collection settings not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for metric collection settings: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used for metric collection settings; a missing section or key keeps the default +def populateSettingValuesFromConfigMap(parsedConfig) + # Get metric collection settings for including or excluding kube-system namespace in PV metrics; each level is nil-checked so a config without this section keeps the default instead of being logged as an error + begin + if !parsedConfig.nil? && !parsedConfig[:metric_collection_settings].nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil?
+ @collectPVKubeSystemMetrics = parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled] + puts "config::Using config map setting for PV kube-system collection" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for PV kube-system collection - #{errorStr}, using defaults, please check config map for errors") + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Metric Collection Settings Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version, so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end +end + +# Write the settings to file, so that they can be set as environment variables +file = File.open("config_metric_collection_env_var", "w") + +if !file.nil?
+ file.write("export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS=#{@collectPVKubeSystemMetrics}\n") + # Close file after writing all metric collection setting environment variables + file.close + puts "****************End Metric Collection Settings Processing********************" +else + puts "Exception while opening file for writing MDM metric config environment variables" + puts "****************End Metric Collection Settings Processing********************" +end diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 58e09f041..aec1bb456 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -42,6 +42,7 @@ data: # When the setting is set to false, only the kube events with !normal event type will be collected enabled = false # When this is enabled (enabled = true), all kube events including normal events will be collected + prometheus-data-collection-settings: |- # Custom Prometheus metrics data collection settings [prometheus_data_collection_settings.cluster] @@ -90,6 +91,15 @@ data: #fieldpass = ["metric_to_pass1", "metric_to_pass12"] #fielddrop = ["metric_to_drop"] + + metric_collection_settings: |- + # Metrics collection settings for metrics sent to Log Analytics and MDM + [metric_collection_settings.collect_kube_system_pv_metrics] + # In the absence of this configmap, default value for collect_kube_system_pv_metrics is false + # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected + enabled = false + # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected + alertable-metrics-configuration-settings: |- # Alertable metrics configuration settings for container resource utilization [alertable_metrics_configuration_settings.container_resource_utilization_thresholds] @@ -100,6 +110,11 @@ data:
container_memory_rss_threshold_percentage = 95.0 # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 + + # Alertable metrics configuration settings for persistent volume utilization + [alertable_metrics_configuration_settings.pv_utilization_thresholds] + # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage + pv_usage_threshold_percentage = 60.0 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 311470660..d9fdc42e9 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -236,6 +236,14 @@ cat config_mdm_metrics_env_var | while read line; do done source config_mdm_metrics_env_var +#Parse the configmap to set the right environment variables for metric collection settings +/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb + +cat config_metric_collection_env_var | while read line; do + echo $line >> ~/.bashrc +done +source config_metric_collection_env_var + #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" #Defaults to use port 10255 diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index db788a37e..128f68697 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -125,7 +125,7 @@ data: type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + 
metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 13796cd1e..7661bb7a1 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -20,6 +20,7 @@ class CAdvisorMetricsAPIClient @clusterEnvVarCollectionEnabled = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] @clusterStdErrLogCollectionEnabled = ENV["AZMON_COLLECT_STDERR_LOGS"] @clusterStdOutLogCollectionEnabled = ENV["AZMON_COLLECT_STDOUT_LOGS"] + @pvKubeSystemCollectionMetricsEnabled = ENV["AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS"] @clusterLogTailExcludPath = ENV["AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH"] @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] @@ -53,6 +54,7 @@ class CAdvisorMetricsAPIClient @@winNodePrevMetricRate = {} @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i #Containers a hash of node name and the last time telemetry was sent for this node @@nodeTelemetryTimeTracker = {} @@ -301,6 +303,8 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryTotal", "containerGpumemoryTotalBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) + + metricDataItems.concat(getPersistentVolumeMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, metricTime)) else @Log.warn("Couldn't get Insights metrics information for host: #{hostName} 
os:#{operatingSystem}") end @@ -311,6 +315,79 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) return metricDataItems end + def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) + telemetryTimeDifference = (DateTime.now.to_time.to_i - @@telemetryPVKubeSystemMetricsTimeTracker).abs + telemetryTimeDifferenceInMinutes = telemetryTimeDifference / 60 + + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + clusterName = KubernetesApiClient.getClusterName + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + + podNamespace = pod["podRef"]["namespace"] + excludeNamespace = false + if (podNamespace.downcase == "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" + excludeNamespace = true + end + + if (!excludeNamespace && !pod["volume"].nil?) + pod["volume"].each do |volume| + if (!volume["pvcRef"].nil?) + pvcRef = volume["pvcRef"] + if (!pvcRef["name"].nil?) + + # A PVC exists on this volume + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + pvcName = pvcRef["name"] + pvcNamespace = pvcRef["namespace"] + + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNameToReturn + metricItem["Value"] = volume[metricNameToCollect] + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = pvcNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = 
volume["capacityBytes"] + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + end + end + end + end + end + rescue => errorStr + @Log.warn("getPersistentVolumeMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + return metricItems + end + + # If kube-system metrics collection enabled, send telemetry + begin + if telemetryTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES && @pvKubeSystemCollectionMetricsEnabled == "true" + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT, {}) + @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + @Log.warn("getPersistentVolumeMetrics kube-system metrics enabled telemetry failed: #{errorStr}") + end + + return metricItems + end + + def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index 2e516a99d..d5107fea1 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -90,6 +90,38 @@ class MdmAlertTemplates } }' + PV_resource_utilization_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/persistentvolumes", + "dimNames": [ + "podName", + "node", + "kubernetesNamespace", + "thresholdPercentage" + ], + "series": [ + { + "dimValues": [ + "%{podNameDimValue}", + "%{computerNameDimValue}", + "%{namespaceDimValue}", + "%{thresholdPercentageDimValue}" + ], + "min": %{pvResourceUtilizationPercentage}, + "max": %{pvResourceUtilizationPercentage}, + "sum": %{pvResourceUtilizationPercentage}, + "count": 1 + } + ] + } + } + }' + + Node_resource_metrics_template = ' { "time": "%{timestamp}", diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb 
b/source/plugins/ruby/MdmMetricsGenerator.rb index 3d75dc6f4..1e7db37cc 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -37,6 +37,10 @@ class MdmMetricsGenerator Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC, } + @@pod_metric_name_metric_percentage_name_hash = { + Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC + } + # Setting this to true since we need to send zero filled metrics at startup. If metrics are absent alert creation fails @sendZeroFilledMetrics = true @@ -259,6 +263,31 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag return records end + def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percentageMetricValue, dims, thresholdPercentage) + records = [] + begin + containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] + pvcNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] + podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] + podUid = dims[Constants::INSIGHTSMETRICS_TAGS_POD_UID] + + resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % { + timestamp: recordTimeStamp, + metricName: @@pod_metric_name_metric_percentage_name_hash[metricName], + podNameDimValue: podName, + computerNameDimValue: computer, + namespaceDimValue: pvcNamespace, + pvResourceUtilizationPercentage: percentageMetricValue, + thresholdPercentageDimValue: thresholdPercentage, + } + records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) + rescue => errorStr + @log.info "Error in getPVResourceUtilMetricRecords: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + return records + end + def getDiskUsageMetricRecords(record) records = [] usedPercent = nil @@ -356,6 +385,7 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] = 
Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD + metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if !cpuThreshold.nil? && !cpuThreshold.empty? @@ -375,6 +405,12 @@ def getContainerResourceUtilizationThresholds memoryWorkingSetThresholdFloat = (memoryWorkingSetThreshold.to_f).round(2) metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = memoryWorkingSetThresholdFloat end + + pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"] + if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? + pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) + metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat + end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index dd1ba24b3..82a6e8814 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -13,6 +13,12 @@ class Constants INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGHTSMETRICS_TAGS_POD_UID = "podUid" + INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" + INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" + INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace" + INSIGHTSMETRICS_TAGS_POD_NAME = "podName" + INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" REASON_OOM_KILLED = "oomkilled" #Kubestate (common) @@ -45,6 +51,7 @@ class 
Constants MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage" MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage" MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage" + MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage" MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage" MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" @@ -56,9 +63,11 @@ class Constants CPU_USAGE_MILLI_CORES = "cpuUsageMillicores" MEMORY_WORKING_SET_BYTES= "memoryWorkingSetBytes" MEMORY_RSS_BYTES = "memoryRssBytes" + PV_USED_BYTES = "pvUsedBytes" DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 + DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 CONTROLLER_KIND_JOB = "job" CONTAINER_TERMINATION_REASON_COMPLETED = "completed" CONTAINER_STATE_TERMINATED = "terminated" @@ -71,6 +80,8 @@ class Constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" + PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent" + PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled" TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index fd43ef98b..3bc674ea8 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -16,7 +16,7 @@ class CAdvisor2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" 
config_param :custom_metrics_azure_regions, :string - config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES" + config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,Constants::PV_USED_BYTES" @@hostName = (OMS::Common.get_hostname) @@ -46,11 +46,13 @@ def start @metrics_to_collect_hash = build_metrics_hash @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i # These variables keep track if any resource utilization threshold exceeded in the last 10 minutes @containersExceededCpuThreshold = false @containersExceededMemRssThreshold = false @containersExceededMemWorkingSetThreshold = false + @pvExceededUsageThreshold = false # initialize cpu and memory limit if @process_incoming_stream @@ -60,6 +62,7 @@ def start @containerCpuLimitHash = {} @containerMemoryLimitHash = {} @containerResourceDimensionHash = {} + @pvUsageHash = {} @@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds end rescue => e @@ -87,6 +90,8 @@ def setThresholdExceededTelemetry(metricName) @containersExceededMemRssThreshold = true elsif metricName == Constants::MEMORY_WORKING_SET_BYTES @containersExceededMemWorkingSetThreshold = true + elsif metricName == Constants::PV_USED_BYTES + @pvExceededUsageThreshold = true end rescue => errorStr @log.info "Error in setThresholdExceededTelemetry: #{errorStr}" @@ -109,13 +114,30 @@ def flushMetricTelemetry properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold properties["MemWSetThresholdExceededInLastFlushInterval"] = @containersExceededMemWorkingSetThreshold 
ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT, properties) - @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i @containersExceededCpuThreshold = false @containersExceededMemRssThreshold = false @containersExceededMemWorkingSetThreshold = false + @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + @log.info "Error in flushMetricTelemetry: #{errorStr} for container resource util telemetry" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + + # Also send for PV usage metrics + begin + pvTimeDifference = (DateTime.now.to_time.to_i - @@pvUsageTelemetryTimeTracker).abs + pvTimeDifferenceInMinutes = pvTimeDifference / 60 + if (pvTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + pvProperties = {} + pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] + pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENT, pvProperties) + @pvExceededUsageThreshold = false + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i end rescue => errorStr - @log.info "Error in flushMetricTelemetry: #{errorStr}" + @log.info "Error in flushMetricTelemetry: #{errorStr} for PV usage telemetry" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @@ -123,6 +145,13 @@ def flushMetricTelemetry def filter(tag, time, record) begin if @process_incoming_stream + + # Check if insights metrics for PV metrics + data_type = record["DataType"] + if data_type == "INSIGHTS_METRICS_BLOB" + return filterPVInsightsMetrics(record) + end + object_name = record["DataItems"][0]["ObjectName"] counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] percentage_metric_value = 0.0 @@ -204,6 +233,51 @@ def filter(tag, time, record) end end + # Builds MDM records for PV usage from an INSIGHTS_METRICS_BLOB record. + # Only the first pvUsedBytes data item present in @metrics_to_collect_hash is evaluated: its usage percentage + # (Value * 100 / pvCapacityBytes tag) is returned as MDM records when >= the configured threshold, otherwise [] is returned. + def
filterPVInsightsMetrics(record) + begin + record["DataItems"].each do |dataItem| + + if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase) + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] + # Default to 0.0 so a missing usage/capacity or a zero capacity compares below the threshold instead of raising on nil + percentage_metric_value = 0.0 + if !usage.nil? && !capacity.nil? && capacity != 0 + percentage_metric_value = (usage * 100.0) / capacity + end + @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + computer = dataItem["Computer"] + resourceDimensions = dataItem["Tags"] + thresholdPercentage = @@metric_threshold_hash[metricName] + + flushMetricTelemetry + if percentage_metric_value >= thresholdPercentage + setThresholdExceededTelemetry(metricName) + return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], + metricName, + computer, + percentage_metric_value, + resourceDimensions, + thresholdPercentage) + else + return [] + end # end if block for percentage metric >= configured threshold % check + end # end if block for dataItem name check + end # end for block of looping through data items + return [] + rescue Exception => e + @log.info "Error processing cadvisor insights metrics record Exception: #{e.class} Message: #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + return [] #return empty array if we ran into any errors + end + end + def ensure_cpu_memory_capacity_set if @cpu_capacity != 0.0 && @memory_capacity != 0.0 @log.info "CPU And Memory Capacity are already set" diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index a44365e9d..b706ff00a 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -88,6 +88,7 @@ def enumerate() end router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG,
insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 38868f2f5..4e90195e5 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -101,6 +101,7 @@ def enumerate() end router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end