From df7ee0ad95b435f834b7ed5615961f848d7e56b9 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Tue, 5 Nov 2019 15:31:50 -0800 Subject: [PATCH] Fix the issue where the health tree is inconsistent if a deployment is deleted --- .../health_container_cpu_memory_aggregator.rb | 92 ++++++++++++++++++- 1 file changed, 88 insertions(+), 4 deletions(-) diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index f6b57e0ae..6d69e0213 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -49,6 +49,9 @@ class HealthContainerCpuMemoryAggregator @@limit_is_array_event_sent = {} @@WORKLOAD_CONTAINER_COUNT_EMPTY_EVENT = "WorkloadContainerCountEmptyEvent" @@LIMIT_IS_ARRAY_EVENT = "ResourceLimitIsAnArrayEvent" + @@cpu_last_sent_monitors = {} + @@memory_last_sent_monitors = {} + def initialize(resources, provider) @pod_uid_lookup = resources.get_pod_uid_lookup @workload_container_count = resources.get_workload_container_count @@ -137,7 +140,6 @@ def aggregate(container_records) end container_instance_record = {} - pod_name = @pod_uid_lookup[lookup_key]["pod_name"] #append the record to the hash # append only if the record is not a duplicate record @@ -160,13 +162,14 @@ def compute_state() # if limits not set, set state to warning # if all records present, sort in descending order of metric, compute index based on StateThresholdPercentage, get the state (pass/fail/warn) based on monitor state (Using [Fail/Warn]ThresholdPercentage, and set the state) @memory_records.each{|k,v| + @@memory_last_sent_monitors.delete(k) #remove from last sent list if the record is present in the current set of signals calculate_monitor_state(v, @provider.get_config(MonitorId::CONTAINER_MEMORY_MONITOR_ID)) } @cpu_records.each{|k,v| + @@cpu_last_sent_monitors.delete(k) #remove from last sent list if the record is 
present in the current set of signals calculate_monitor_state(v, @provider.get_config(MonitorId::CONTAINER_CPU_MONITOR_ID)) } - @log.info "Finished computing state" end @@ -175,7 +178,6 @@ def get_records container_cpu_memory_records = [] @cpu_records.each{|resource_key, record| - cpu_limit_mc = 1.0 if record["limit"].is_a?(Numeric) cpu_limit_mc = record["limit"]/1000000.to_f @@ -221,6 +223,42 @@ def get_records container_cpu_memory_records.push(health_record) } + # If all records that were sent previously are present in current set, this will not be executed + if @@cpu_last_sent_monitors.keys.size != 0 + @@cpu_last_sent_monitors.keys.each{|key| + begin + @log.info "Container CPU monitor #{key} not present in current set. Sending none state transition" + tokens = key.split('_') + namespace = tokens[0] + workload_name = "#{tokens[0]}~~#{tokens[1]}" + container = tokens[2] + health_monitor_record = { + "timestamp" => time_now, + "state" => HealthMonitorStates::NONE, + "details" => { + "reason" => "No record received for workload #{workload_name}", + "workload_name" => workload_name, + "namespace" => namespace, + "container" => container + } + } + + monitor_instance_id = HealthMonitorHelpers.get_monitor_instance_id(MonitorId::CONTAINER_CPU_MONITOR_ID, key.split('_')) #container_cpu_utilization-namespace-workload-container + + health_record = {} + health_record[HealthMonitorRecordFields::MONITOR_ID] = MonitorId::CONTAINER_CPU_MONITOR_ID + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + container_cpu_memory_records.push(health_record) + rescue => e + @log.info "Error when trying to create NONE State transition signal for #{key} for monitor #{monitor_instance_id} #{e.message}" + next + end + } + end + 
@memory_records.each{|resource_key, record| health_monitor_record = { "timestamp" => time_now, @@ -245,6 +283,52 @@ def get_records health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now container_cpu_memory_records.push(health_record) } + + # If all records that were sent previously are present in current set, this will not be executed + if @@memory_last_sent_monitors.keys.size != 0 + @@memory_last_sent_monitors.keys.each{|key| + begin + @log.info "Container Memory monitor #{key} not present in current set. Sending none state transition" + tokens = key.split('_') + namespace = tokens[0] + workload_name = "#{tokens[0]}~~#{tokens[1]}" + container = tokens[2] + health_monitor_record = { + "timestamp" => time_now, + "state" => HealthMonitorStates::NONE, + "details" => { + "reason" => "No record received for workload #{workload_name}", + "workload_name" => workload_name, + "namespace" => namespace, + "container" => container + } + } + monitor_instance_id = HealthMonitorHelpers.get_monitor_instance_id(MonitorId::CONTAINER_MEMORY_MONITOR_ID, key.split('_')) #container_memory_utilization-namespace-workload-container + health_record = {} + health_record[HealthMonitorRecordFields::MONITOR_ID] = MonitorId::CONTAINER_MEMORY_MONITOR_ID + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + container_cpu_memory_records.push(health_record) + rescue => e + @log.info "Error when trying to create NONE State transition signal for #{key} for monitor #{monitor_instance_id} #{e.message}" + next + end + } + end + + #reset the last sent monitors list + @@memory_last_sent_monitors = {} + @@cpu_last_sent_monitors = {} + + # add the current set of signals for comparison in next iteration + @cpu_records.keys.each{|k| + 
@@cpu_last_sent_monitors[k] = true + } + @memory_records.keys.each{|k| + @@memory_last_sent_monitors[k] = true + } return container_cpu_memory_records end @@ -298,4 +382,4 @@ def calculate_container_instance_state(counter_value, limit, config) end end end -end \ No newline at end of file +end