diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 345c51633..5ce5d79d2 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -13,6 +13,7 @@ @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD +@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -101,6 +102,25 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors") @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end + + # Get mdm metrics config settings for job completion + begin + jobCompletion = parsedConfig[:alertable_metrics_configuration_settings][:job_completion_threshold] + if !jobCompletion.nil? + jobCompletionThreshold = jobCompletion[:job_completion_threshold_time_minutes] + jobCompletionThresholdInt = jobCompletionThreshold.to_i + if jobCompletionThresholdInt.kind_of? 
Integer + @jobCompletionThresholdMinutes = jobCompletionThresholdInt + else + puts "config::Non integer value or value not convertible to integer specified for job completion threshold, using default " + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end + puts "config::Using config map settings for MDM metric configuration settings for job completion" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for job completion - #{errorStr}, using defaults, please check config map for errors") + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end end end @@ -125,6 +145,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") + file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n") # Close file after writing all MDM setting environment variables file.close puts "****************End MDM Metrics Config Processing********************" diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index e38d9b4ab..543f270c1 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -126,6 +126,11 @@ data: [alertable_metrics_configuration_settings.pv_utilization_thresholds] # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage pv_usage_threshold_percentage = 60.0 + + # Alertable metrics configuration settings for completed jobs count + 
[alertable_metrics_configuration_settings.job_completion_threshold] + # Threshold for completed job count, metric will be sent only for those jobs which completed longer ago than the following threshold + job_completion_threshold_time_minutes = 360 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index c5a363741..98347d272 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -31,6 +31,8 @@ class KubernetesApiClient @@TokenStr = nil @@NodeMetrics = Hash.new @@WinNodeArray = [] + @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} def initialize end @@ -403,9 +405,12 @@ def getPodUid(podNameSpace, podMetadata) def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] + timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 begin clusterId = getClusterId podNameSpace = pod["metadata"]["namespace"] + podName = pod["metadata"]["name"] podUid = getPodUid(podNameSpace, pod["metadata"]) if podUid.nil? 
return metricItems @@ -456,6 +461,33 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) + #Telemetry about omsagent requests and limits + begin + if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent")) + nodePodContainerKey = [nodeName, podName, containerName, metricNametoReturn].join("~~") + @@resourceLimitsTelemetryHash[nodePodContainerKey] = metricValue + end + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@resourceLimitsTelemetryHash.each { |key, value| + keyElements = key.split("~~") + if keyElements.length != 4 + next + end + + # get dimension values by key + telemetryProps = {} + telemetryProps["Computer"] = keyElements[0] + telemetryProps["PodName"] = keyElements[1] + telemetryProps["ContainerName"] = keyElements[2] + metricNameFromKey = keyElements[3] + ApplicationInsightsUtility.sendMetricTelemetry(metricNameFromKey, value, telemetryProps) + } + @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} + end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getContainerResourceRequestsAndLimits failed: #{errorStr} for metric #{metricNameToCollect}") + end #No container level limit for the given metric, so default to node level limit else nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect @@ -791,7 +823,7 @@ def getKubeAPIServerUrl def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) kubeServiceRecords = [] begin - if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty? ) + if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? 
&& !serviceList["items"].empty?) servicesCount = serviceList["items"].length @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}") serviceList["items"].each do |item| diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index f2b713ff6..e889c3f09 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -45,7 +45,7 @@ class MdmAlertTemplates "dimValues": [ "%{controllerNameDimValue}", "%{namespaceDimValue}", - "6" + "%{jobCompletionThreshold}" ], "min": %{containerCountMetricValue}, "max": %{containerCountMetricValue}, diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 8703f43a7..f2aa92c14 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -107,13 +107,28 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat podControllerNameDimValue = key_elements[0] podNamespaceDimValue = key_elements[1] - record = metricsTemplate % { - timestamp: batch_time, - metricName: metricName, - controllerNameDimValue: podControllerNameDimValue, - namespaceDimValue: podNamespaceDimValue, - containerCountMetricValue: value, - } + # Special handling for jobs since we need to send the threshold as a dimension as it is configurable + if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT + metric_threshold_hash = getContainerResourceUtilizationThresholds + #Converting this to hours since we already have olderThanHours dimension. 
+ jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0 + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + jobCompletionThreshold: jobCompletionThresholdHours, + } + else + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + } + end records.push(Yajl::Parser.parse(StringIO.new(record))) } else @@ -140,9 +155,11 @@ def flushPodMdmMetricTelemetry staleJobHashValues = @stale_job_count_hash.values staleJobMetricCount = staleJobHashValues.inject(0) { |sum, x| sum + x } + metric_threshold_hash = getContainerResourceUtilizationThresholds properties["ContainerRestarts"] = containerRestartMetricCount properties["OomKilledContainers"] = oomKilledContainerMetricCount properties["OldCompletedJobs"] = staleJobMetricCount + properties["JobCompletionThresholdTimeInMinutes"] = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_METRICS_HEART_BEAT_EVENT, properties) ApplicationInsightsUtility.sendCustomEvent(Constants::POD_READY_PERCENTAGE_HEART_BEAT_EVENT, {}) rescue => errorStr @@ -465,6 +482,7 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if !cpuThreshold.nil? 
&& !cpuThreshold.empty? @@ -490,6 +508,12 @@ def getContainerResourceUtilizationThresholds pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat end + + jobCompletionTimeThreshold = ENV["AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD"] + if !jobCompletionTimeThreshold.nil? && !jobCompletionTimeThreshold.empty? + jobCompletionTimeThresholdInt = jobCompletionTimeThreshold.to_i + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = jobCompletionTimeThresholdInt + end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index e0b0d1e0c..906019b95 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -69,14 +69,15 @@ class Constants MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes" MEMORY_RSS_BYTES = "memoryRssBytes" PV_USED_BYTES = "pvUsedBytes" + JOB_COMPLETION_TIME = "completedJobTimeMinutes" DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 + DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES = 360 CONTROLLER_KIND_JOB = "job" CONTAINER_TERMINATION_REASON_COMPLETED = "completed" CONTAINER_STATE_TERMINATED = "terminated" - STALE_JOB_TIME_IN_MINUTES = 360 TELEGRAF_DISK_METRICS = "container.azm.ms/disk" OMSAGENT_ZERO_FILL = "omsagent" KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index 77370e284..d9cb71bd4 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -88,6 +88,7 @@ def initialize() @pod_count_by_phase = {} @pod_uids = {} @process_incoming_stream = 
CustomMetricsUtils.check_custom_metrics_availability + @metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @log.debug { "Starting podinventory_to_mdm plugin" } end @@ -259,7 +260,7 @@ def process_record_for_terminated_job_metric(podControllerNameDimValue, podNames if !containerFinishedTime.nil? && !containerFinishedTime.empty? finishedTimeParsed = Time.parse(containerFinishedTime) # Check to see if job was completed 6 hours ago/STALE_JOB_TIME_IN_MINUTES - if ((Time.now - finishedTimeParsed) / 60) > Constants::STALE_JOB_TIME_IN_MINUTES + if ((Time.now - finishedTimeParsed) / 60) > @metric_threshold_hash[Constants::JOB_COMPLETION_TIME] MdmMetricsGenerator.generateStaleJobCountMetrics(podControllerNameDimValue, podNamespaceDimValue) end