diff --git a/alerts/recommended_alerts_ARM/PVUsagePercentage.json b/alerts/recommended_alerts_ARM/PVUsagePercentage.json
new file mode 100644
index 000000000..e6cdbee15
--- /dev/null
+++ b/alerts/recommended_alerts_ARM/PVUsagePercentage.json
@@ -0,0 +1,174 @@
+{
+ "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
+ "contentVersion": "1.0.0.0",
+ "parameters": {
+ "alertName": {
+ "type": "string",
+ "minLength": 1,
+ "metadata": {
+ "description": "Name of the alert"
+ }
+ },
+ "alertDescription": {
+ "type": "string",
+ "defaultValue": "This is a metric alert",
+ "metadata": {
+ "description": "Description of alert"
+ }
+ },
+ "alertSeverity": {
+ "type": "int",
+ "defaultValue": 3,
+ "allowedValues": [
+ 0,
+ 1,
+ 2,
+ 3,
+ 4
+ ],
+ "metadata": {
+ "description": "Severity of alert {0,1,2,3,4}"
+ }
+ },
+ "isEnabled": {
+ "type": "bool",
+ "defaultValue": true,
+ "metadata": {
+ "description": "Specifies whether the alert is enabled"
+ }
+ },
+ "clusterResourceId": {
+ "type": "string",
+ "minLength": 1,
+ "metadata": {
+ "description": "Full Resource ID of the kubernetes cluster emitting the metric that will be used for the comparison. For example /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroups/ResourceGroupName/providers/Microsoft.ContainerService/managedClusters/cluster-xyz"
+ }
+ },
+ "operator": {
+ "type": "string",
+ "defaultValue": "GreaterThan",
+ "allowedValues": [
+ "Equals",
+ "NotEquals",
+ "GreaterThan",
+ "GreaterThanOrEqual",
+ "LessThan",
+ "LessThanOrEqual"
+ ],
+ "metadata": {
+ "description": "Operator comparing the current value with the threshold value."
+ }
+ },
+ "threshold": {
+ "type": "int",
+ "defaultValue": 80,
+ "metadata": {
+ "description": "The threshold value at which the alert is activated."
+ },
+ "minValue": 1,
+ "maxValue": 100
+ },
+ "timeAggregation": {
+ "type": "string",
+ "defaultValue": "Average",
+ "allowedValues": [
+ "Average",
+ "Minimum",
+ "Maximum",
+ "Count"
+ ],
+ "metadata": {
+ "description": "How the data that is collected should be combined over time."
+ }
+ },
+ "windowSize": {
+ "type": "string",
+ "defaultValue": "PT5M",
+ "allowedValues": [
+ "PT1M",
+ "PT5M",
+ "PT15M",
+ "PT30M",
+ "PT1H",
+ "PT6H",
+ "PT12H",
+ "PT24H"
+ ],
+ "metadata": {
+ "description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
+ }
+ },
+ "evaluationFrequency": {
+ "type": "string",
+ "defaultValue": "PT1M",
+ "allowedValues": [
+ "PT1M",
+ "PT5M",
+ "PT15M",
+ "PT30M",
+ "PT1H"
+ ],
+ "metadata": {
+ "description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
+ }
+ },
+ "actionGroupId": {
+ "type": "string",
+ "defaultValue": "",
+ "metadata": {
+ "description": "The ID of the action group that is triggered when the alert is activated or deactivated"
+ }
+ }
+ },
+ "variables": {},
+ "resources": [
+ {
+ "name": "[parameters('alertName')]",
+ "type": "Microsoft.Insights/metricAlerts",
+ "location": "global",
+ "apiVersion": "2018-03-01",
+ "tags": {},
+ "properties": {
+ "description": "[parameters('alertDescription')]",
+ "severity": "[parameters('alertSeverity')]",
+ "enabled": "[parameters('isEnabled')]",
+ "scopes": [
+ "[parameters('clusterResourceId')]"
+ ],
+ "evaluationFrequency": "[parameters('evaluationFrequency')]",
+ "windowSize": "[parameters('windowSize')]",
+ "criteria": {
+ "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria",
+ "allOf": [
+ {
+ "name": "1st criterion",
+ "metricName": "pvUsageExceededPercentage",
+ "metricNamespace": "Insights.Container/persistentvolumes",
+ "dimensions": [
+ {
+ "name": "kubernetesNamespace",
+ "operator": "Include",
+ "values": [
+ "*"
+ ]
+ },
+ {
+ "name": "podName",
+ "operator": "Include",
+ "values": [
+ "*"
+ ]
+ }
+ ],
+ "operator": "[parameters('operator')]",
+ "threshold": "[parameters('threshold')]",
+ "timeAggregation": "[parameters('timeAggregation')]",
+ "skipMetricValidation": true
+ }
+ ]
+ },
+ "actions": "[if(empty(parameters('actionGroupId')), json('null'), json(concat('[{\"actionGroupId\": \"',parameters('actionGroupId'),'\"}]')))]"
+ }
+ }
+ ]
+}
diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf
index f02ec0131..e55c62fbc 100644
--- a/build/linux/installer/conf/container.conf
+++ b/build/linux/installer/conf/container.conf
@@ -46,7 +46,7 @@
type filter_cadvisor2mdm
custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast
- metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes
+ metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes
log_level info
diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf
index 9ada8425f..ba40b7a35 100644
--- a/build/linux/installer/conf/kube.conf
+++ b/build/linux/installer/conf/kube.conf
@@ -74,7 +74,7 @@
type filter_cadvisor2mdm
custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast
- metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes
+ metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes
log_level info
diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data
index 87b89b14c..ca2538b79 100644
--- a/build/linux/installer/datafiles/base_container.data
+++ b/build/linux/installer/datafiles/base_container.data
@@ -120,6 +120,7 @@ MAINTAINER: 'Microsoft Corporation'
/opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root
/opt/tomlparser-prom-customconfig.rb; build/linux/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root
/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root
+/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root
/opt/tomlparser-health-config.rb; build/linux/installer/scripts/tomlparser-health-config.rb; 755; root; root
/opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root
diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb
index 1c01dd8c6..345c51633 100644
--- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb
+++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb
@@ -12,6 +12,7 @@
@percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD
@percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD
@percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
+@percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
# Use parser to parse the configmap toml file to a ruby structure
def parseConfigMap
@@ -35,7 +36,7 @@ def parseConfigMap
# Use the ruby structure created after config parsing to set the right values to be used for MDM metric configuration settings
def populateSettingValuesFromConfigMap(parsedConfig)
if !parsedConfig.nil? && !parsedConfig[:alertable_metrics_configuration_settings].nil?
- # Get mdm metrics config settings for resource utilization
+ # Get mdm metrics config settings for container resource utilization
begin
resourceUtilization = parsedConfig[:alertable_metrics_configuration_settings][:container_resource_utilization_thresholds]
if !resourceUtilization.nil?
@@ -66,7 +67,7 @@ def populateSettingValuesFromConfigMap(parsedConfig)
puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default "
@percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
end
- puts "config::Using config map settings for MDM metric configuration settings for resource utilization"
+ puts "config::Using config map settings for MDM metric configuration settings for container resource utilization"
end
rescue => errorStr
ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for resource utilization - #{errorStr}, using defaults, please check config map for errors")
@@ -74,6 +75,32 @@ def populateSettingValuesFromConfigMap(parsedConfig)
@percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD
@percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
end
+
+ # Get mdm metrics config settings for PV utilization
+ begin
+ isUsingPVThresholdConfig = false
+ pvUtilizationThresholds = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds]
+ if !pvUtilizationThresholds.nil?
+ pvUsageThreshold = pvUtilizationThresholds[:pv_usage_threshold_percentage]
+ if !pvUsageThreshold.nil?
+ pvUsageThresholdFloat = pvUsageThreshold.to_f
+ if pvUsageThresholdFloat.kind_of? Float
+ @percentagePVUsageThreshold = pvUsageThresholdFloat
+ isUsingPVThresholdConfig = true
+ end
+ end
+ end
+
+ if isUsingPVThresholdConfig
+ puts "config::Using config map settings for MDM metric configuration settings for PV utilization"
+ else
+ puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default "
+ @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
+ end
+ rescue => errorStr
+ ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors")
+ @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
+ end
end
end
@@ -97,6 +124,7 @@ def populateSettingValuesFromConfigMap(parsedConfig)
file.write("export AZMON_ALERT_CONTAINER_CPU_THRESHOLD=#{@percentageCpuUsageThreshold}\n")
file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n")
file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n")
+ file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n")
# Close file after writing all MDM setting environment variables
file.close
puts "****************End MDM Metrics Config Processing********************"
diff --git a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb
new file mode 100644
index 000000000..40d87b7f1
--- /dev/null
+++ b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb
@@ -0,0 +1,71 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+require_relative "tomlrb"
+require_relative "ConfigParseErrorLogger"
+require_relative "microsoft/omsagent/plugin/constants"
+
+@configMapMountPath = "/etc/config/settings/metric_collection_settings"
+@configVersion = ""
+@configSchemaVersion = ""
+
+# Setting default values which will be used in case they are not set in the configmap or if configmap doesn't exist
+@collectPVKubeSystemMetrics = false
+
+# Use parser to parse the configmap toml file to a ruby structure
+def parseConfigMap
+ begin
+ # Check to see if config map is created
+ if (File.file?(@configMapMountPath))
+ puts "config::configmap container-azm-ms-agentconfig for metric collection settings mounted, parsing values"
+ parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true)
+ puts "config::Successfully parsed mounted config map"
+ return parsedConfig
+ else
+ puts "config::configmap container-azm-ms-agentconfig for metric collection settings not mounted, using defaults"
+ return nil
+ end
+ rescue => errorStr
+ ConfigParseErrorLogger.logError("Exception while parsing config map for metric collection settings: #{errorStr}, using defaults, please check config map for errors")
+ return nil
+ end
+end
+
+# Use the ruby structure created after config parsing to set the right values to be used for metric collection settings
+def populateSettingValuesFromConfigMap(parsedConfig)
+ # Get metric collection settings for including or excluding kube-system namespace in PV metrics
+ begin
+ if !parsedConfig.nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil?
+ @collectPVKubeSystemMetrics = parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled]
+ puts "config::Using config map setting for PV kube-system collection"
+ end
+ rescue => errorStr
+ ConfigParseErrorLogger.logError("Exception while reading config map settings for PV kube-system collection - #{errorStr}, using defaults, please check config map for errors")
+ end
+end
+
+@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"]
+puts "****************Start Metric Collection Settings Processing********************"
+if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version, so hardcoding it
+ configMapSettings = parseConfigMap
+ if !configMapSettings.nil?
+ populateSettingValuesFromConfigMap(configMapSettings)
+ end
+else
+ if (File.file?(@configMapMountPath))
+ ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version")
+ end
+end
+
+# Write the settings to file, so that they can be set as environment variables
+file = File.open("config_metric_collection_env_var", "w")
+
+if !file.nil?
+ file.write("export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS=#{@collectPVKubeSystemMetrics}\n")
+ # Close file after writing all metric collection setting environment variables
+ file.close
+ puts "****************End Metric Collection Settings Processing********************"
+else
+ puts "Exception while opening file for writing MDM metric config environment variables"
+ puts "****************End Metric Collection Settings Processing********************"
+end
diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml
index 58e09f041..aec1bb456 100644
--- a/kubernetes/container-azm-ms-agentconfig.yaml
+++ b/kubernetes/container-azm-ms-agentconfig.yaml
@@ -42,6 +42,7 @@ data:
# When the setting is set to false, only the kube events with !normal event type will be collected
enabled = false
# When this is enabled (enabled = true), all kube events including normal events will be collected
+
prometheus-data-collection-settings: |-
# Custom Prometheus metrics data collection settings
[prometheus_data_collection_settings.cluster]
@@ -90,6 +91,15 @@ data:
#fieldpass = ["metric_to_pass1", "metric_to_pass12"]
#fielddrop = ["metric_to_drop"]
+
+ metric_collection_settings: |-
+ # Metrics collection settings for metrics sent to Log Analytics and MDM
+ [metric_collection_settings.collect_kube_system_pv_metrics]
+ # In the absence of this configmap, default value for collect_kube_system_pv_metrics is false
+ # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected
+ enabled = false
+ # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected
+
alertable-metrics-configuration-settings: |-
# Alertable metrics configuration settings for container resource utilization
[alertable_metrics_configuration_settings.container_resource_utilization_thresholds]
@@ -100,6 +110,11 @@ data:
container_memory_rss_threshold_percentage = 95.0
# Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage
container_memory_working_set_threshold_percentage = 95.0
+
+ # Alertable metrics configuration settings for persistent volume utilization
+ [alertable_metrics_configuration_settings.pv_utilization_thresholds]
+ # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage
+ pv_usage_threshold_percentage = 60.0
integrations: |-
[integrations.azure_network_policy_manager]
collect_basic_metrics = false
diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh
index 311470660..d9fdc42e9 100644
--- a/kubernetes/linux/main.sh
+++ b/kubernetes/linux/main.sh
@@ -236,6 +236,14 @@ cat config_mdm_metrics_env_var | while read line; do
done
source config_mdm_metrics_env_var
+#Parse the configmap to set the right environment variables for metric collection settings
+/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb
+
+cat config_metric_collection_env_var | while read line; do
+ echo $line >> ~/.bashrc
+done
+source config_metric_collection_env_var
+
#Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request
echo "Making wget request to cadvisor endpoint with port 10250"
#Defaults to use port 10255
diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml
index db788a37e..128f68697 100644
--- a/kubernetes/omsagent.yaml
+++ b/kubernetes/omsagent.yaml
@@ -125,7 +125,7 @@ data:
type filter_cadvisor2mdm
custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast
- metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes
+ metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes
log_level info
diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb
index 13796cd1e..7661bb7a1 100644
--- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb
+++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb
@@ -20,6 +20,7 @@ class CAdvisorMetricsAPIClient
@clusterEnvVarCollectionEnabled = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"]
@clusterStdErrLogCollectionEnabled = ENV["AZMON_COLLECT_STDERR_LOGS"]
@clusterStdOutLogCollectionEnabled = ENV["AZMON_COLLECT_STDOUT_LOGS"]
+ @pvKubeSystemCollectionMetricsEnabled = ENV["AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS"]
@clusterLogTailExcludPath = ENV["AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH"]
@clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"]
@clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"]
@@ -53,6 +54,7 @@ class CAdvisorMetricsAPIClient
@@winNodePrevMetricRate = {}
@@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i
@@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i
+ @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i
#Containers a hash of node name and the last time telemetry was sent for this node
@@nodeTelemetryTimeTracker = {}
@@ -301,6 +303,8 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601)
metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryTotal", "containerGpumemoryTotalBytes", metricTime))
metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime))
metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime))
+
+ metricDataItems.concat(getPersistentVolumeMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, metricTime))
else
@Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}")
end
@@ -311,6 +315,79 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601)
return metricDataItems
end
+ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime)
+ telemetryTimeDifference = (DateTime.now.to_time.to_i - @@telemetryPVKubeSystemMetricsTimeTracker).abs
+ telemetryTimeDifferenceInMinutes = telemetryTimeDifference / 60
+
+ metricItems = []
+ clusterId = KubernetesApiClient.getClusterId
+ clusterName = KubernetesApiClient.getClusterName
+ begin
+ metricInfo = metricJSON
+ metricInfo["pods"].each do |pod|
+
+ podNamespace = pod["podRef"]["namespace"]
+ excludeNamespace = false
+ if (podNamespace.downcase == "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false"
+ excludeNamespace = true
+ end
+
+ if (!excludeNamespace && !pod["volume"].nil?)
+ pod["volume"].each do |volume|
+ if (!volume["pvcRef"].nil?)
+ pvcRef = volume["pvcRef"]
+ if (!pvcRef["name"].nil?)
+
+ # A PVC exists on this volume
+ podUid = pod["podRef"]["uid"]
+ podName = pod["podRef"]["name"]
+ pvcName = pvcRef["name"]
+ pvcNamespace = pvcRef["namespace"]
+
+ metricItem = {}
+ metricItem["CollectionTime"] = metricPollTime
+ metricItem["Computer"] = hostName
+ metricItem["Name"] = metricNameToReturn
+ metricItem["Value"] = volume[metricNameToCollect]
+ metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
+ metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE
+
+ metricTags = {}
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = pvcNamespace
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"]
+
+ metricItem["Tags"] = metricTags
+
+ metricItems.push(metricItem)
+ end
+ end
+ end
+ end
+ end
+ rescue => errorStr
+ @Log.warn("getPersistentVolumeMetrics failed: #{errorStr} for metric #{metricNameToCollect}")
+ return metricItems
+ end
+
+ # If kube-system metrics collection enabled, send telemetry
+ begin
+ if telemetryTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES && @pvKubeSystemCollectionMetricsEnabled == "true"
+ ApplicationInsightsUtility.sendCustomEvent(Constants::PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT, {})
+ @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i
+ end
+ rescue => errorStr
+ @Log.warn("getPersistentVolumeMetrics kube-system metrics enabled telemetry failed: #{errorStr}")
+ end
+
+ return metricItems
+ end
+
+
def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime)
metricItems = []
clusterId = KubernetesApiClient.getClusterId
diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb
index 2e516a99d..d5107fea1 100644
--- a/source/plugins/ruby/MdmAlertTemplates.rb
+++ b/source/plugins/ruby/MdmAlertTemplates.rb
@@ -90,6 +90,38 @@ class MdmAlertTemplates
}
}'
+ PV_resource_utilization_template = '
+ {
+ "time": "%{timestamp}",
+ "data": {
+ "baseData": {
+ "metric": "%{metricName}",
+ "namespace": "insights.container/persistentvolumes",
+ "dimNames": [
+ "podName",
+ "node",
+ "kubernetesNamespace",
+ "thresholdPercentage"
+ ],
+ "series": [
+ {
+ "dimValues": [
+ "%{podNameDimValue}",
+ "%{computerNameDimValue}",
+ "%{namespaceDimValue}",
+ "%{thresholdPercentageDimValue}"
+ ],
+ "min": %{pvResourceUtilizationPercentage},
+ "max": %{pvResourceUtilizationPercentage},
+ "sum": %{pvResourceUtilizationPercentage},
+ "count": 1
+ }
+ ]
+ }
+ }
+ }'
+
+
Node_resource_metrics_template = '
{
"time": "%{timestamp}",
diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb
index 3d75dc6f4..1e7db37cc 100644
--- a/source/plugins/ruby/MdmMetricsGenerator.rb
+++ b/source/plugins/ruby/MdmMetricsGenerator.rb
@@ -37,6 +37,10 @@ class MdmMetricsGenerator
Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC,
}
+ @@pod_metric_name_metric_percentage_name_hash = {
+ Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC
+ }
+
# Setting this to true since we need to send zero filled metrics at startup. If metrics are absent alert creation fails
@sendZeroFilledMetrics = true
@@ -259,6 +263,31 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag
return records
end
+ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percentageMetricValue, dims, thresholdPercentage)
+ records = []
+ begin
+ containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME]
+ pvcNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE]
+ podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME]
+ podUid = dims[Constants::INSIGHTSMETRICS_TAGS_POD_UID]
+
+ resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % {
+ timestamp: recordTimeStamp,
+ metricName: @@pod_metric_name_metric_percentage_name_hash[metricName],
+ podNameDimValue: podName,
+ computerNameDimValue: computer,
+ namespaceDimValue: pvcNamespace,
+ pvResourceUtilizationPercentage: percentageMetricValue,
+ thresholdPercentageDimValue: thresholdPercentage,
+ }
+ records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord)))
+ rescue => errorStr
+ @log.info "Error in getPVResourceUtilMetricRecords: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return records
+ end
+
def getDiskUsageMetricRecords(record)
records = []
usedPercent = nil
@@ -356,6 +385,7 @@ def getContainerResourceUtilizationThresholds
metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD
metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD
metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
+ metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"]
if !cpuThreshold.nil? && !cpuThreshold.empty?
@@ -375,6 +405,12 @@ def getContainerResourceUtilizationThresholds
memoryWorkingSetThresholdFloat = (memoryWorkingSetThreshold.to_f).round(2)
metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = memoryWorkingSetThresholdFloat
end
+
+ pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"]
+ if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty?
+ pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2)
+ metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat
+ end
rescue => errorStr
@log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}"
ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb
index dd1ba24b3..82a6e8814 100644
--- a/source/plugins/ruby/constants.rb
+++ b/source/plugins/ruby/constants.rb
@@ -13,6 +13,12 @@ class Constants
INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace"
INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName"
INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind"
+ INSIGHTSMETRICS_TAGS_POD_UID = "podUid"
+ INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv"
+ INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName"
+ INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace"
+ INSIGHTSMETRICS_TAGS_POD_NAME = "podName"
+ INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes"
INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics"
REASON_OOM_KILLED = "oomkilled"
#Kubestate (common)
@@ -45,6 +51,7 @@ class Constants
MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage"
MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage"
MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage"
+ MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage"
MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage"
MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage"
MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage"
@@ -56,9 +63,11 @@ class Constants
CPU_USAGE_MILLI_CORES = "cpuUsageMillicores"
MEMORY_WORKING_SET_BYTES= "memoryWorkingSetBytes"
MEMORY_RSS_BYTES = "memoryRssBytes"
+ PV_USED_BYTES = "pvUsedBytes"
DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0
DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0
DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0
+ DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0
CONTROLLER_KIND_JOB = "job"
CONTAINER_TERMINATION_REASON_COMPLETED = "completed"
CONTAINER_STATE_TERMINATED = "terminated"
@@ -71,6 +80,8 @@ class Constants
CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent"
POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent"
CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent"
+ PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent"
+ PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled"
TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10
KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15
MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour"
diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb
index fd43ef98b..3bc674ea8 100644
--- a/source/plugins/ruby/filter_cadvisor2mdm.rb
+++ b/source/plugins/ruby/filter_cadvisor2mdm.rb
@@ -16,7 +16,7 @@ class CAdvisor2MdmFilter < Filter
config_param :enable_log, :integer, :default => 0
config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log"
config_param :custom_metrics_azure_regions, :string
- config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES"
+ config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,Constants::PV_USED_BYTES"
@@hostName = (OMS::Common.get_hostname)
@@ -46,11 +46,13 @@ def start
@metrics_to_collect_hash = build_metrics_hash
@log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}"
@@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i
+ @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i
# These variables keep track if any resource utilization threshold exceeded in the last 10 minutes
@containersExceededCpuThreshold = false
@containersExceededMemRssThreshold = false
@containersExceededMemWorkingSetThreshold = false
+ @pvExceededUsageThreshold = false
# initialize cpu and memory limit
if @process_incoming_stream
@@ -60,6 +62,7 @@ def start
@containerCpuLimitHash = {}
@containerMemoryLimitHash = {}
@containerResourceDimensionHash = {}
+ @pvUsageHash = {}
@@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds
end
rescue => e
@@ -87,6 +90,8 @@ def setThresholdExceededTelemetry(metricName)
@containersExceededMemRssThreshold = true
elsif metricName == Constants::MEMORY_WORKING_SET_BYTES
@containersExceededMemWorkingSetThreshold = true
+ elsif metricName == Constants::PV_USED_BYTES
+ @pvExceededUsageThreshold = true
end
rescue => errorStr
@log.info "Error in setThresholdExceededTelemetry: #{errorStr}"
@@ -109,13 +114,30 @@ def flushMetricTelemetry
properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold
properties["MemWSetThresholdExceededInLastFlushInterval"] = @containersExceededMemWorkingSetThreshold
ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT, properties)
- @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i
@containersExceededCpuThreshold = false
@containersExceededMemRssThreshold = false
@containersExceededMemWorkingSetThreshold = false
+ @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i
+ end
+ rescue => errorStr
+ @log.info "Error in flushMetricTelemetry: #{errorStr} for container resource util telemetry"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+
+ # Also send heartbeat telemetry for PV usage metrics, tracked on its own flush interval
+ begin
+ pvTimeDifference = (DateTime.now.to_time.to_i - @@pvUsageTelemetryTimeTracker).abs
+ pvTimeDifferenceInMinutes = pvTimeDifference / 60
+ if (pvTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES)
+ pvProperties = {}
+ pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES]
+ pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold
+ ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENT, pvProperties)
+ @pvExceededUsageThreshold = false
+ @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i
end
rescue => errorStr
- @log.info "Error in flushMetricTelemetry: #{errorStr}"
+ @log.info "Error in flushMetricTelemetry: #{errorStr} for PV usage telemetry"
ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
end
end
@@ -123,6 +145,13 @@ def flushMetricTelemetry
def filter(tag, time, record)
begin
if @process_incoming_stream
+
+ # Route insights-metrics records (which carry the PV metrics) to the dedicated PV filter
+ data_type = record["DataType"]
+ if data_type == "INSIGHTS_METRICS_BLOB"
+ return filterPVInsightsMetrics(record)
+ end
+
object_name = record["DataItems"][0]["ObjectName"]
counter_name = record["DataItems"][0]["Collections"][0]["CounterName"]
percentage_metric_value = 0.0
@@ -204,6 +233,47 @@ def filter(tag, time, record)
end
end
+ def filterPVInsightsMetrics(record)
+ begin
+ # Emits at most one MDM record per incoming record: the first PV_USED_BYTES data item wins
+ record["DataItems"].each do |dataItem|
+
+ if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase)
+ metricName = dataItem["Name"]
+ usage = dataItem["Value"]
+ capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES]
+ percentage_metric_value = 0.0
+ # guard: capacity tag may be missing or zero — avoid nil arithmetic / divide-by-zero
+ percentage_metric_value = (usage * 100.0) / capacity if !capacity.nil? && capacity != 0
+ @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}"
+ @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}"
+
+ computer = dataItem["Computer"]
+ resourceDimensions = dataItem["Tags"]
+ thresholdPercentage = @@metric_threshold_hash[metricName]
+
+ flushMetricTelemetry
+ if percentage_metric_value >= thresholdPercentage
+ setThresholdExceededTelemetry(metricName)
+ return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"],
+ metricName,
+ computer,
+ percentage_metric_value,
+ resourceDimensions,
+ thresholdPercentage)
+ else
+ return []
+ end # end if block for percentage metric > configured threshold % check
+ end # end if block for dataItem name check
+ end # end for block of looping through data items
+ return []
+ rescue => e
+ @log.info "Error processing cadvisor insights metrics record Exception: #{e.class} Message: #{e.message}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace)
+ return [] #return empty array if we ran into any errors
+ end
+ end
+
def ensure_cpu_memory_capacity_set
if @cpu_capacity != 0.0 && @memory_capacity != 0.0
@log.info "CPU And Memory Capacity are already set"
diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb
index a44365e9d..b706ff00a 100644
--- a/source/plugins/ruby/in_cadvisor_perf.rb
+++ b/source/plugins/ruby/in_cadvisor_perf.rb
@@ -88,6 +88,7 @@ def enumerate()
end
router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream
+ router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream
if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0)
$log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb
index 38868f2f5..4e90195e5 100644
--- a/source/plugins/ruby/in_win_cadvisor_perf.rb
+++ b/source/plugins/ruby/in_win_cadvisor_perf.rb
@@ -101,6 +101,7 @@ def enumerate()
end
router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream
+ router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream
if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0)
$log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
end