Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
04826d0
Add in pv metrics from cadvisor
Aug 18, 2020
e0fbdef
Merge branch 'ci_dev' into grwehner/pv
Aug 19, 2020
a459794
changed to send only pv usage & add kube-system toggle config
Aug 20, 2020
0ec8ef9
variable name fixes
Aug 21, 2020
fb8a214
Added kube-system config
Aug 24, 2020
0b2f9dc
mdm filter
Aug 24, 2020
1bad74f
add pv_used_bytes to mdm filter metrics conf
Aug 24, 2020
7068629
filter fixes
Aug 24, 2020
94348cd
more filter fixes
Aug 24, 2020
58230fd
end statement fix
Aug 24, 2020
f68c04a
log fixes
Aug 25, 2020
46c1b50
all pv records to mdm
Aug 25, 2020
db24b0f
different mdm generator method
Aug 25, 2020
9d6874f
out_mdm log path
Aug 26, 2020
cdf96a0
try to get out_mdm logging path
Aug 26, 2020
c902df6
pv metric now sending to ME
Aug 26, 2020
0f41269
add in threshold condition
Aug 27, 2020
d4148cc
constants and consistent naming
Aug 27, 2020
1d6cee6
comments and code cleanup
Aug 27, 2020
357914a
remove container name, add pod name/uid
Aug 31, 2020
9377262
log fixes and constnat change
Aug 31, 2020
ee14b2b
naming fix
Aug 31, 2020
c1d46e8
cleanup
Aug 31, 2020
130e5d7
add pvUsedBytes as metric to collect
gracewehner Aug 31, 2020
d0f8d58
more cleanup
Aug 31, 2020
7cce941
Merge branch 'grwehner/pv' of https://github.com/microsoft/Docker-Pro…
Aug 31, 2020
0e7593b
Merge remote-tracking branch 'origin/ci_dev' into grwehner/pv
Aug 31, 2020
f0885e4
boolean fix
Aug 31, 2020
62b84ba
set threshold to 60
gracewehner Sep 1, 2020
8fca127
add check that pvUsedBytes is a configured metric to collect
gracewehner Sep 3, 2020
da0a34d
code review feedback changes
gracewehner Sep 4, 2020
68404bf
after testing changes
gracewehner Sep 4, 2020
4505b45
whitespace fix
gracewehner Sep 4, 2020
c08054b
variable name fix
gracewehner Sep 4, 2020
c88b9ab
naming changes
gracewehner Sep 8, 2020
668a691
match inventory schema naming
gracewehner Sep 21, 2020
bf57879
last naming fixes
gracewehner Sep 21, 2020
a01af6f
change podUID to podUid to match KubePodInventory
gracewehner Sep 21, 2020
b9b79c8
pv alert template
gracewehner Sep 22, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 174 additions & 0 deletions alerts/recommended_alerts_ARM/PVUsagePercentage.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "This is a metric alert",
"metadata": {
"description": "Description of alert"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 3,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"clusterResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Full Resource ID of the kubernetes cluster emitting the metric that will be used for the comparison. For example /subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/ResourceGroupName/providers/Microsoft.ContainerService/managedClusters/cluster-xyz"
}
},
"operator": {
"type": "string",
"defaultValue": "GreaterThan",
"allowedValues": [
"Equals",
"NotEquals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "int",
"defaultValue": 80,
"metadata": {
"description": "The threshold value at which the alert is activated."
},
"minValue": 1,
"maxValue": 100
},
"timeAggregation": {
"type": "string",
"defaultValue": "Average",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT1M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "How often the metric alert is evaluated, represented in ISO 8601 duration format"
}
},
"actionGroupId": {
"type": "string",
"defaultValue": "",
"metadata": {
"description": "The ID of the action group that is triggered when the alert is activated or deactivated"
}
}
},
"variables": {},
"resources": [
{
"name": "[parameters('alertName')]",
"type": "Microsoft.Insights/metricAlerts",
"location": "global",
"apiVersion": "2018-03-01",
"tags": {},
"properties": {
"description": "[parameters('alertDescription')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"scopes": [
"[parameters('clusterResourceId')]"
],
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "pvUsageExceededPercentage",
"metricNamespace": "Insights.Container/persistentvolumes",
"dimensions": [
{
"name": "kubernetesNamespace",
"operator": "Include",
"values": [
"*"
]
},
{
"name": "podName",
"operator": "Include",
"values": [
"*"
]
}
],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"skipMetricValidation": true
}
]
},
"actions": "[if(empty(parameters('actionGroupId')), json('null'), json(concat('[{\"actionGroupId\": \"',parameters('actionGroupId'),'\"}]')))]"
}
}
]
}
2 changes: 1 addition & 1 deletion build/linux/installer/conf/container.conf
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
<filter mdm.cadvisorperf**>
type filter_cadvisor2mdm
custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast
metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes
metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes
log_level info
</filter>

Expand Down
2 changes: 1 addition & 1 deletion build/linux/installer/conf/kube.conf
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
<filter mdm.cadvisorperf**>
type filter_cadvisor2mdm
custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast
metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes
metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes
log_level info
</filter>

Expand Down
1 change: 1 addition & 0 deletions build/linux/installer/datafiles/base_container.data
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ MAINTAINER: 'Microsoft Corporation'
/opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root
/opt/tomlparser-prom-customconfig.rb; build/linux/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root
/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root
/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root

/opt/tomlparser-health-config.rb; build/linux/installer/scripts/tomlparser-health-config.rb; 755; root; root
/opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root
Expand Down
32 changes: 30 additions & 2 deletions build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
@percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD
@percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD
@percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
@percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD

# Use parser to parse the configmap toml file to a ruby structure
def parseConfigMap
Expand All @@ -35,7 +36,7 @@ def parseConfigMap
# Use the ruby structure created after config parsing to set the right values to be used for MDM metric configuration settings
def populateSettingValuesFromConfigMap(parsedConfig)
if !parsedConfig.nil? && !parsedConfig[:alertable_metrics_configuration_settings].nil?
# Get mdm metrics config settings for resource utilization
# Get mdm metrics config settings for container resource utilization
begin
resourceUtilization = parsedConfig[:alertable_metrics_configuration_settings][:container_resource_utilization_thresholds]
if !resourceUtilization.nil?
Expand Down Expand Up @@ -66,14 +67,40 @@ def populateSettingValuesFromConfigMap(parsedConfig)
puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default "
@percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
end
puts "config::Using config map settings for MDM metric configuration settings for resource utilization"
puts "config::Using config map settings for MDM metric configuration settings for container resource utilization"
end
rescue => errorStr
ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for resource utilization - #{errorStr}, using defaults, please check config map for errors")
@percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD
@percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD
@percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
end

# Get mdm metrics config settings for PV utilization
# Resolves @percentagePVUsageThreshold from
# [alertable_metrics_configuration_settings.pv_utilization_thresholds] pv_usage_threshold_percentage.
# Three outcomes are reported separately: setting absent (default), setting present and
# numeric (config map value), setting present but not numeric (default).
begin
  pvUtilizationThresholds = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds]
  pvUsageThreshold = pvUtilizationThresholds.nil? ? nil : pvUtilizationThresholds[:pv_usage_threshold_percentage]

  if pvUsageThreshold.nil?
    # Nothing was configured - this is not a malformed value, so do not report it as one
    puts "config::PV usage threshold not specified in config map, using default"
    @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
  else
    begin
      # Kernel#Float raises for values such as "abc" or true, whereas to_f would
      # silently turn a non-numeric string into 0.0 and the validity check could never fail
      @percentagePVUsageThreshold = Float(pvUsageThreshold)
      puts "config::Using config map settings for MDM metric configuration settings for PV utilization"
    rescue ArgumentError, TypeError
      puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default "
      @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
    end
  end
rescue => errorStr
  ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors")
  @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
end
end
end

Expand All @@ -97,6 +124,7 @@ def populateSettingValuesFromConfigMap(parsedConfig)
file.write("export AZMON_ALERT_CONTAINER_CPU_THRESHOLD=#{@percentageCpuUsageThreshold}\n")
file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n")
file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n")
file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n")
# Close file after writing all MDM setting environment variables
file.close
puts "****************End MDM Metrics Config Processing********************"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/local/bin/ruby
# frozen_string_literal: true

require_relative "tomlrb"
require_relative "ConfigParseErrorLogger"
require_relative "microsoft/omsagent/plugin/constants"

# Path of the mounted metric collection settings file that parseConfigMap checks for and loads
@configMapMountPath = "/etc/config/settings/metric_collection_settings"
# NOTE(review): assigned here but never read in this script - confirm whether it is still needed
@configVersion = ""
# Placeholder; replaced further down by the AZMON_AGENT_CFG_SCHEMA_VERSION environment variable
@configSchemaVersion = ""

# Setting default values which will be used in case they are not set in the configmap or if configmap doesn't exist
# This value is interpolated into the AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS export written at the end of the script
@collectPVKubeSystemMetrics = false

# Load the mounted metric collection settings config map, if there is one.
#
# Returns the structure produced by Tomlrb.load_file (keys symbolized) when the
# file at @configMapMountPath exists, or nil when the file is not mounted or an
# error is raised while checking for / parsing it (the error is logged).
def parseConfigMap
  # Guard clause: nothing mounted means the caller should fall back to defaults
  unless File.file?(@configMapMountPath)
    puts "config::configmap container-azm-ms-agentconfig for metric collection settings not mounted, using defaults"
    return nil
  end

  puts "config::configmap container-azm-ms-agentconfig for metric collection settings mounted, parsing values"
  tomlSettings = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true)
  puts "config::Successfully parsed mounted config map"
  tomlSettings
rescue => parseError
  ConfigParseErrorLogger.logError("Exception while parsing config map for metric collection settings: #{parseError}, using defaults, please check config map for errors")
  nil
end

# Use the ruby structure created after config parsing to set the right values to be used for metric collection settings
#
# parsedConfig - parsed config map with symbol keys, or nil.
#
# Sets @collectPVKubeSystemMetrics to the value of
# [metric_collection_settings.collect_kube_system_pv_metrics] enabled when that
# key is present; otherwise the current value is left untouched. Unexpected
# shapes (for example a non-table value where a table is expected) are logged
# through ConfigParseErrorLogger and the current value is kept.
def populateSettingValuesFromConfigMap(parsedConfig)
  # Get metric collection settings for including or excluding kube-system namespace in PV metrics
  begin
    # dig stops and returns nil at the first missing level, so a config map that simply
    # lacks the section is treated as "not configured" instead of raising NoMethodError
    # on nil and being reported as a config map error
    kubeSystemPVEnabled = parsedConfig.nil? ? nil : parsedConfig.dig(:metric_collection_settings, :collect_kube_system_pv_metrics, :enabled)
    if !kubeSystemPVEnabled.nil?
      @collectPVKubeSystemMetrics = kubeSystemPVEnabled
      puts "config::Using config map setting for PV kube-system collection"
    end
  rescue => errorStr
    ConfigParseErrorLogger.logError("Exception while reading config map settings for PV kube-system collection - #{errorStr}, using defaults, please check config map for errors")
  end
end

@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"]
puts "****************Start Metric Collection Settings Processing********************"

#note v1 is the only supported schema version, so hardcoding it
schemaVersionSupported = !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0

if schemaVersionSupported
  # Only a successfully parsed config map is applied; otherwise the defaults stay in place
  mountedSettings = parseConfigMap
  populateSettingValuesFromConfigMap(mountedSettings) unless mountedSettings.nil?
elsif File.file?(@configMapMountPath)
  # A config map is mounted but cannot be honoured because of its schema version
  ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version")
end

# Write the settings to file, so that they can be set as environment variables
# File.open either returns an open handle or raises - it never returns nil - so
# failures are handled with rescue, and the block form closes the handle even
# when the write itself raises.
begin
  File.open("config_metric_collection_env_var", "w") do |file|
    file.write("export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS=#{@collectPVKubeSystemMetrics}\n")
  end
rescue => errorStr
  puts "Exception while opening file for writing metric collection config environment variables - #{errorStr}"
ensure
  puts "****************End Metric Collection Settings Processing********************"
end
15 changes: 15 additions & 0 deletions kubernetes/container-azm-ms-agentconfig.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ data:
# When the setting is set to false, only the kube events with !normal event type will be collected
enabled = false
# When this is enabled (enabled = true), all kube events including normal events will be collected

prometheus-data-collection-settings: |-
# Custom Prometheus metrics data collection settings
[prometheus_data_collection_settings.cluster]
Expand Down Expand Up @@ -90,6 +91,15 @@ data:
#fieldpass = ["metric_to_pass1", "metric_to_pass12"]

#fielddrop = ["metric_to_drop"]

metric_collection_settings: |-
# Metrics collection settings for metrics sent to Log Analytics and MDM
[metric_collection_settings.collect_kube_system_pv_metrics]
# In the absence of this configmap, default value for collect_kube_system_pv_metrics is false
# When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected
enabled = false
# When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected

alertable-metrics-configuration-settings: |-
# Alertable metrics configuration settings for container resource utilization
[alertable_metrics_configuration_settings.container_resource_utilization_thresholds]
Expand All @@ -100,6 +110,11 @@ data:
container_memory_rss_threshold_percentage = 95.0
# Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage
container_memory_working_set_threshold_percentage = 95.0

# Alertable metrics configuration settings for persistent volume utilization
[alertable_metrics_configuration_settings.pv_utilization_thresholds]
# Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage
pv_usage_threshold_percentage = 60.0
integrations: |-
[integrations.azure_network_policy_manager]
collect_basic_metrics = false
Expand Down
8 changes: 8 additions & 0 deletions kubernetes/linux/main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,14 @@ cat config_mdm_metrics_env_var | while read line; do
done
source config_mdm_metrics_env_var

#Parse the configmap to set the right environment variables for metric collection settings
/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb

# Persist the generated exports for later shells, then load them into this one.
# IFS= / read -r / quoted expansion copy each line verbatim: no backslash
# processing, word splitting or globbing of text that originates from the config map.
while IFS= read -r line; do
  printf '%s\n' "$line" >> ~/.bashrc
done < config_metric_collection_env_var
source config_metric_collection_env_var

#Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request
echo "Making wget request to cadvisor endpoint with port 10250"
#Defaults to use port 10255
Expand Down
2 changes: 1 addition & 1 deletion kubernetes/omsagent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ data:
<filter mdm.cadvisorperf**>
type filter_cadvisor2mdm
custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast
metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes
metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes
log_level info
</filter>

Expand Down
Loading