Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
@percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD
@percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
@percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES

# Use parser to parse the configmap toml file to a ruby structure
def parseConfigMap
Expand Down Expand Up @@ -101,6 +102,25 @@ def populateSettingValuesFromConfigMap(parsedConfig)
ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors")
@percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
end

# Get mdm metrics config settings for job completion
begin
jobCompletion = parsedConfig[:alertable_metrics_configuration_settings][:job_completion_threshold]
if !jobCompletion.nil?
jobCompletionThreshold = jobCompletion[:job_completion_threshold_time_minutes]
jobCompletionThresholdInt = jobCompletionThreshold.to_i
if jobCompletionThresholdInt.kind_of? Integer
@jobCompletionThresholdMinutes = jobCompletionThresholdInt
else
puts "config::Non interger value or value not convertible to integer specified for job completion threshold, using default "
@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES
end
puts "config::Using config map settings for MDM metric configuration settings for job completion"
end
rescue => errorStr
ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for job completion - #{errorStr}, using defaults, please check config map for errors")
@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES
end
end
end

Expand All @@ -125,6 +145,7 @@ def populateSettingValuesFromConfigMap(parsedConfig)
file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n")
file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n")
file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n")
file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n")
# Close file after writing all MDM setting environment variables
file.close
puts "****************End MDM Metrics Config Processing********************"
Expand Down
5 changes: 5 additions & 0 deletions kubernetes/container-azm-ms-agentconfig.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ data:
[alertable_metrics_configuration_settings.pv_utilization_thresholds]
# Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage
pv_usage_threshold_percentage = 60.0

# Alertable metrics configuration settings for completed jobs count
[alertable_metrics_configuration_settings.job_completion_threshold]
# Threshold (in minutes) for completed jobs; the metric will be sent only for jobs that finished longer ago than this threshold
job_completion_threshold_time_minutes = 360
integrations: |-
[integrations.azure_network_policy_manager]
collect_basic_metrics = false
Expand Down
34 changes: 33 additions & 1 deletion source/plugins/ruby/KubernetesApiClient.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ class KubernetesApiClient
@@TokenStr = nil
@@NodeMetrics = Hash.new
@@WinNodeArray = []
@@telemetryTimeTracker = DateTime.now.to_time.to_i
@@resourceLimitsTelemetryHash = {}

def initialize
end
Expand Down Expand Up @@ -403,9 +405,12 @@ def getPodUid(podNameSpace, podMetadata)

def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
metricItems = []
timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs
timeDifferenceInMinutes = timeDifference / 60
begin
clusterId = getClusterId
podNameSpace = pod["metadata"]["namespace"]
podName = pod["metadata"]["name"]
podUid = getPodUid(podNameSpace, pod["metadata"])
if podUid.nil?
return metricItems
Expand Down Expand Up @@ -456,6 +461,33 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle
metricProps["Collections"].push(metricCollections)
metricItem["DataItems"].push(metricProps)
metricItems.push(metricItem)
#Telemetry about omsagent requests and limits
Copy link
Member

@vishiy vishiy Apr 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for adding this. This works and also you are not adding additional calls. For future reference, you can easily pass this to us thru downward API. Please see here for reference -

begin
if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent"))
nodePodContainerKey = [nodeName, podName, containerName, metricNametoReturn].join("~~")
@@resourceLimitsTelemetryHash[nodePodContainerKey] = metricValue
end
if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES)
@@resourceLimitsTelemetryHash.each { |key, value|
keyElements = key.split("~~")
if keyElements.length != 4
next
end

# get dimension values by key
telemetryProps = {}
telemetryProps["Computer"] = keyElements[0]
telemetryProps["PodName"] = keyElements[1]
telemetryProps["ContainerName"] = keyElements[2]
metricNameFromKey = keyElements[3]
ApplicationInsightsUtility.sendMetricTelemetry(metricNameFromKey, value, telemetryProps)
}
@@telemetryTimeTracker = DateTime.now.to_time.to_i
@@resourceLimitsTelemetryHash = {}
end
rescue => errorStr
$log.warn("Exception while generating Telemetry from getContainerResourceRequestsAndLimits failed: #{errorStr} for metric #{metricNameToCollect}")
end
#No container level limit for the given metric, so default to node level limit
else
nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect
Expand Down Expand Up @@ -791,7 +823,7 @@ def getKubeAPIServerUrl
def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601)
kubeServiceRecords = []
begin
if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty? )
if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty?)
servicesCount = serviceList["items"].length
@Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}")
serviceList["items"].each do |item|
Expand Down
2 changes: 1 addition & 1 deletion source/plugins/ruby/MdmAlertTemplates.rb
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class MdmAlertTemplates
"dimValues": [
"%{controllerNameDimValue}",
"%{namespaceDimValue}",
"6"
"%{jobCompletionThreshold}"
],
"min": %{containerCountMetricValue},
"max": %{containerCountMetricValue},
Expand Down
38 changes: 31 additions & 7 deletions source/plugins/ruby/MdmMetricsGenerator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,28 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat
podControllerNameDimValue = key_elements[0]
podNamespaceDimValue = key_elements[1]

record = metricsTemplate % {
timestamp: batch_time,
metricName: metricName,
controllerNameDimValue: podControllerNameDimValue,
namespaceDimValue: podNamespaceDimValue,
containerCountMetricValue: value,
}
# Special handling for jobs since we need to send the threshold as a dimension as it is configurable
if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT
metric_threshold_hash = getContainerResourceUtilizationThresholds
#Converting this to hours since we already have olderThanHours dimension.
jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0
record = metricsTemplate % {
timestamp: batch_time,
metricName: metricName,
controllerNameDimValue: podControllerNameDimValue,
namespaceDimValue: podNamespaceDimValue,
containerCountMetricValue: value,
jobCompletionThreshold: jobCompletionThresholdHours,
}
else
record = metricsTemplate % {
timestamp: batch_time,
metricName: metricName,
controllerNameDimValue: podControllerNameDimValue,
namespaceDimValue: podNamespaceDimValue,
containerCountMetricValue: value,
}
end
records.push(Yajl::Parser.parse(StringIO.new(record)))
}
else
Expand All @@ -140,9 +155,11 @@ def flushPodMdmMetricTelemetry
staleJobHashValues = @stale_job_count_hash.values
staleJobMetricCount = staleJobHashValues.inject(0) { |sum, x| sum + x }

metric_threshold_hash = getContainerResourceUtilizationThresholds
properties["ContainerRestarts"] = containerRestartMetricCount
properties["OomKilledContainers"] = oomKilledContainerMetricCount
properties["OldCompletedJobs"] = staleJobMetricCount
properties["JobCompletionThesholdTimeInMinutes"] = metric_threshold_hash[Constants::JOB_COMPLETION_TIME]
ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_METRICS_HEART_BEAT_EVENT, properties)
ApplicationInsightsUtility.sendCustomEvent(Constants::POD_READY_PERCENTAGE_HEART_BEAT_EVENT, {})
rescue => errorStr
Expand Down Expand Up @@ -465,6 +482,7 @@ def getContainerResourceUtilizationThresholds
metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD
metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES

cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"]
if !cpuThreshold.nil? && !cpuThreshold.empty?
Expand All @@ -490,6 +508,12 @@ def getContainerResourceUtilizationThresholds
pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2)
metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat
end

jobCompletionTimeThreshold = ENV["AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD"]
if !jobCompletionTimeThreshold.nil? && !jobCompletionTimeThreshold.empty?
jobCompletionTimeThresholdInt = jobCompletionTimeThreshold.to_i
metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = jobCompletionTimeThresholdInt
end
rescue => errorStr
@log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}"
ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
Expand Down
3 changes: 2 additions & 1 deletion source/plugins/ruby/constants.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,15 @@ class Constants
MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes"
MEMORY_RSS_BYTES = "memoryRssBytes"
PV_USED_BYTES = "pvUsedBytes"
JOB_COMPLETION_TIME = "completedJobTimeMinutes"
DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0
DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0
DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0
DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0
DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES = 360
CONTROLLER_KIND_JOB = "job"
CONTAINER_TERMINATION_REASON_COMPLETED = "completed"
CONTAINER_STATE_TERMINATED = "terminated"
STALE_JOB_TIME_IN_MINUTES = 360
TELEGRAF_DISK_METRICS = "container.azm.ms/disk"
OMSAGENT_ZERO_FILL = "omsagent"
KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system"
Expand Down
3 changes: 2 additions & 1 deletion source/plugins/ruby/podinventory_to_mdm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def initialize()
@pod_count_by_phase = {}
@pod_uids = {}
@process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability
@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds
@log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}"
@log.debug { "Starting podinventory_to_mdm plugin" }
end
Expand Down Expand Up @@ -259,7 +260,7 @@ def process_record_for_terminated_job_metric(podControllerNameDimValue, podNames
if !containerFinishedTime.nil? && !containerFinishedTime.empty?
finishedTimeParsed = Time.parse(containerFinishedTime)
# Check whether the job completed longer ago than the configurable job-completion threshold (default 360 minutes / 6 hours)
if ((Time.now - finishedTimeParsed) / 60) > Constants::STALE_JOB_TIME_IN_MINUTES
if ((Time.now - finishedTimeParsed) / 60) > @metric_threshold_hash[Constants::JOB_COMPLETION_TIME]
MdmMetricsGenerator.generateStaleJobCountMetrics(podControllerNameDimValue,
podNamespaceDimValue)
end
Expand Down