Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
@percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD
@percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
@percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES

# Use parser to parse the configmap toml file to a ruby structure
def parseConfigMap
Expand Down Expand Up @@ -101,6 +102,25 @@ def populateSettingValuesFromConfigMap(parsedConfig)
ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors")
@percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
end

# Get mdm metrics config settings for job completion
begin
jobCompletion = parsedConfig[:alertable_metrics_configuration_settings][:job_completion_threshold]
if !jobCompletion.nil?
jobCompletionThreshold = jobCompletion[:job_completion_threshold_time_minutes]
jobCompletionThresholdInt = jobCompletionThreshold.to_i
if jobCompletionThresholdInt.kind_of? Integer
@jobCompletionThresholdMinutes = jobCompletionThresholdInt
else
puts "config::Non interger value or value not convertible to integer specified for job completion threshold, using default "
@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES
end
puts "config::Using config map settings for MDM metric configuration settings for job completion"
end
rescue => errorStr
ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for job completion - #{errorStr}, using defaults, please check config map for errors")
@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES
end
end
end

Expand All @@ -125,6 +145,7 @@ def populateSettingValuesFromConfigMap(parsedConfig)
file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n")
file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n")
file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n")
file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n")
# Close file after writing all MDM setting environment variables
file.close
puts "****************End MDM Metrics Config Processing********************"
Expand Down
5 changes: 5 additions & 0 deletions kubernetes/container-azm-ms-agentconfig.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ data:
[alertable_metrics_configuration_settings.pv_utilization_thresholds]
# Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage
pv_usage_threshold_percentage = 60.0

# Alertable metrics configuration settings for completed jobs count
[alertable_metrics_configuration_settings.job_completion_threshold]
# Threshold (in minutes) for completed jobs; the metric will be sent only for jobs that finished longer ago than this threshold
job_completion_threshold_time_minutes = 360
integrations: |-
[integrations.azure_network_policy_manager]
collect_basic_metrics = false
Expand Down
34 changes: 33 additions & 1 deletion source/plugins/ruby/KubernetesApiClient.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ class KubernetesApiClient
@@TokenStr = nil
@@NodeMetrics = Hash.new
@@WinNodeArray = []
@@telemetryTimeTracker = DateTime.now.to_time.to_i
@@resourceLimitsTelemetryHash = {}

def initialize
end
Expand Down Expand Up @@ -403,9 +405,12 @@ def getPodUid(podNameSpace, podMetadata)

def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
metricItems = []
timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs
timeDifferenceInMinutes = timeDifference / 60
begin
clusterId = getClusterId
podNameSpace = pod["metadata"]["namespace"]
podName = pod["metadata"]["name"]
podUid = getPodUid(podNameSpace, pod["metadata"])
if podUid.nil?
return metricItems
Expand Down Expand Up @@ -456,6 +461,33 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle
metricProps["Collections"].push(metricCollections)
metricItem["DataItems"].push(metricProps)
metricItems.push(metricItem)
#Telemetry about omsagent requests and limits
Copy link
Member

@vishiy vishiy Apr 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for adding this. This works and also you are not adding additional calls. For future reference, you can easily pass this to us thru downward API. Please see here for reference -

begin
if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent"))
nodePodContainerKey = [nodeName, podName, containerName, metricNametoReturn].join("~~")
@@resourceLimitsTelemetryHash[nodePodContainerKey] = metricValue
end
if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES)
@@resourceLimitsTelemetryHash.each { |key, value|
keyElements = key.split("~~")
if keyElements.length != 4
next
end

# get dimension values by key
telemetryProps = {}
telemetryProps["Computer"] = keyElements[0]
telemetryProps["PodName"] = keyElements[1]
telemetryProps["ContainerName"] = keyElements[2]
metricNameFromKey = keyElements[3]
ApplicationInsightsUtility.sendMetricTelemetry(metricNameFromKey, value, telemetryProps)
}
@@telemetryTimeTracker = DateTime.now.to_time.to_i
@@resourceLimitsTelemetryHash = {}
end
rescue => errorStr
$log.warn("Exception while generating Telemetry from getContainerResourceRequestsAndLimits failed: #{errorStr} for metric #{metricNameToCollect}")
end
#No container level limit for the given metric, so default to node level limit
else
nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect
Expand Down Expand Up @@ -791,7 +823,7 @@ def getKubeAPIServerUrl
def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601)
kubeServiceRecords = []
begin
if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty? )
if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty?)
servicesCount = serviceList["items"].length
@Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}")
serviceList["items"].each do |item|
Expand Down
2 changes: 1 addition & 1 deletion source/plugins/ruby/MdmAlertTemplates.rb
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class MdmAlertTemplates
"dimValues": [
"%{controllerNameDimValue}",
"%{namespaceDimValue}",
"6"
"%{jobCompletionThreshold}"
],
"min": %{containerCountMetricValue},
"max": %{containerCountMetricValue},
Expand Down
38 changes: 31 additions & 7 deletions source/plugins/ruby/MdmMetricsGenerator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,28 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat
podControllerNameDimValue = key_elements[0]
podNamespaceDimValue = key_elements[1]

record = metricsTemplate % {
timestamp: batch_time,
metricName: metricName,
controllerNameDimValue: podControllerNameDimValue,
namespaceDimValue: podNamespaceDimValue,
containerCountMetricValue: value,
}
# Special handling for jobs since we need to send the threshold as a dimension as it is configurable
if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT
metric_threshold_hash = getContainerResourceUtilizationThresholds
#Converting this to hours since we already have olderThanHours dimension.
jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0
record = metricsTemplate % {
timestamp: batch_time,
metricName: metricName,
controllerNameDimValue: podControllerNameDimValue,
namespaceDimValue: podNamespaceDimValue,
containerCountMetricValue: value,
jobCompletionThreshold: jobCompletionThresholdHours,
}
else
record = metricsTemplate % {
timestamp: batch_time,
metricName: metricName,
controllerNameDimValue: podControllerNameDimValue,
namespaceDimValue: podNamespaceDimValue,
containerCountMetricValue: value,
}
end
records.push(Yajl::Parser.parse(StringIO.new(record)))
}
else
Expand All @@ -140,9 +155,11 @@ def flushPodMdmMetricTelemetry
staleJobHashValues = @stale_job_count_hash.values
staleJobMetricCount = staleJobHashValues.inject(0) { |sum, x| sum + x }

metric_threshold_hash = getContainerResourceUtilizationThresholds
properties["ContainerRestarts"] = containerRestartMetricCount
properties["OomKilledContainers"] = oomKilledContainerMetricCount
properties["OldCompletedJobs"] = staleJobMetricCount
properties["JobCompletionThesholdTimeInMinutes"] = metric_threshold_hash[Constants::JOB_COMPLETION_TIME]
ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_METRICS_HEART_BEAT_EVENT, properties)
ApplicationInsightsUtility.sendCustomEvent(Constants::POD_READY_PERCENTAGE_HEART_BEAT_EVENT, {})
rescue => errorStr
Expand Down Expand Up @@ -465,6 +482,7 @@ def getContainerResourceUtilizationThresholds
metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD
metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES

cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"]
if !cpuThreshold.nil? && !cpuThreshold.empty?
Expand All @@ -490,6 +508,12 @@ def getContainerResourceUtilizationThresholds
pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2)
metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat
end

jobCompletionTimeThreshold = ENV["AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD"]
if !jobCompletionTimeThreshold.nil? && !jobCompletionTimeThreshold.empty?
jobCompletionTimeThresholdInt = jobCompletionTimeThreshold.to_i
metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = jobCompletionTimeThresholdInt
end
rescue => errorStr
@log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}"
ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
Expand Down
3 changes: 2 additions & 1 deletion source/plugins/ruby/constants.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,15 @@ class Constants
MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes"
MEMORY_RSS_BYTES = "memoryRssBytes"
PV_USED_BYTES = "pvUsedBytes"
JOB_COMPLETION_TIME = "completedJobTimeMinutes"
DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0
DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0
DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0
DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0
DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES = 360
CONTROLLER_KIND_JOB = "job"
CONTAINER_TERMINATION_REASON_COMPLETED = "completed"
CONTAINER_STATE_TERMINATED = "terminated"
STALE_JOB_TIME_IN_MINUTES = 360
TELEGRAF_DISK_METRICS = "container.azm.ms/disk"
OMSAGENT_ZERO_FILL = "omsagent"
KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system"
Expand Down
3 changes: 2 additions & 1 deletion source/plugins/ruby/podinventory_to_mdm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def initialize()
@pod_count_by_phase = {}
@pod_uids = {}
@process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability
@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds
@log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}"
@log.debug { "Starting podinventory_to_mdm plugin" }
end
Expand Down Expand Up @@ -259,7 +260,7 @@ def process_record_for_terminated_job_metric(podControllerNameDimValue, podNames
if !containerFinishedTime.nil? && !containerFinishedTime.empty?
finishedTimeParsed = Time.parse(containerFinishedTime)
# Check whether the job completed longer ago than the configurable job-completion threshold (default 360 minutes / 6 hours)
if ((Time.now - finishedTimeParsed) / 60) > Constants::STALE_JOB_TIME_IN_MINUTES
if ((Time.now - finishedTimeParsed) / 60) > @metric_threshold_hash[Constants::JOB_COMPLETION_TIME]
MdmMetricsGenerator.generateStaleJobCountMetrics(podControllerNameDimValue,
podNamespaceDimValue)
end
Expand Down