From 66ee409ffb994418efe66ec52555054e1adb02c4 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Tue, 22 Oct 2019 15:24:30 -0700 Subject: [PATCH 01/10] Bug Fixes 10222019 --- installer/conf/healthmonitorconfig.json | 2 +- .../plugin/filter_health_model_builder.rb | 18 ++++---- .../health_container_cpu_memory_aggregator.rb | 44 ++++++++++++------- .../plugin/health/health_monitor_utils.rb | 22 +++++----- 4 files changed, 51 insertions(+), 35 deletions(-) diff --git a/installer/conf/healthmonitorconfig.json b/installer/conf/healthmonitorconfig.json index e4019fe73..21273f5a0 100644 --- a/installer/conf/healthmonitorconfig.json +++ b/installer/conf/healthmonitorconfig.json @@ -31,6 +31,6 @@ "ConsecutiveSamplesForStateTransition": 2 }, "node_condition": { - "NodeConditionTypesForFailedState": "outofdisk,networkunavailable" + "NodeConditionTypesForFailedState": "diskpressure,networkunavailable" } } \ No newline at end of file diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb index afb514a73..533d4a9a2 100644 --- a/source/code/plugin/filter_health_model_builder.rb +++ b/source/code/plugin/filter_health_model_builder.rb @@ -106,14 +106,16 @@ def filter_stream(tag, es) es.each{|time, record| records.push(record) } - @buffer.add_to_buffer(records) - - container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) - container_records_aggregator.aggregate(@container_cpu_memory_records) - container_records_aggregator.compute_state - aggregated_container_records = container_records_aggregator.get_records - @buffer.add_to_buffer(aggregated_container_records) - + @buffer.add_to_buffer(records) # in_kube_health records + + aggregated_container_records = [] + if !@container_cpu_memory_records.nil? && !@container_cpu_memory_records.empty? + container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) + container_records_aggregator.aggregate(@container_cpu_memory_records) + container_records_aggregator.compute_state + aggregated_container_records = container_records_aggregator.get_records + end + @buffer.add_to_buffer(aggregated_container_records) #container cpu/memory records records_to_process = @buffer.get_buffer @buffer.reset_buffer @container_cpu_memory_records = [] diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index ef1016158..097268439 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -163,11 +163,23 @@ def get_records container_cpu_memory_records = [] @cpu_records.each{|resource_key, record| + + cpu_limit_mc = 1.0 + if record["limit"].is_a?(Numeric) + cpu_limit_mc = record["limit"]/1000000.to_f + else + @log.info "CPU Limit is not a number #{record['limit']}" + if record["limit"].is_a?(Array) + if record["limit"].size > 0 + cpu_limit_mc = (record["limit"][0])/1000000.to_f + end + end + end health_monitor_record = { "timestamp" => time_now, "state" => record["state"], "details" => { - "cpu_limit_millicores" => record["limit"]/1000000.to_f, + "cpu_limit_millicores" => cpu_limit_mc, "cpu_usage_instances" => record["records"].map{|r| r.each {|k,v| k == "counter_value" ? r[k] = r[k] / 1000000.to_f : r[k] }}, @@ -219,12 +231,10 @@ def get_records private def calculate_monitor_state(v, config) - if !v['limit_set'] && v['namespace'] != 'kube-system' - v["state"] = HealthMonitorStates::WARNING - else - # sort records by descending order of metric - v["records"] = v["records"].sort_by{|record| record["counter_value"]}.reverse - size = v["records"].size + # sort records by descending order of metric + v["records"] = v["records"].sort_by{|record| record["counter_value"]}.reverse + size = v["records"].size + if !v["record_count"].nil? if size < v["record_count"] unknown_count = v["record_count"] - size for i in unknown_count.downto(1) @@ -232,16 +242,20 @@ def calculate_monitor_state(v, config) v["records"].insert(0, {"counter_value" => -1, "container" => v["container"], "pod_name" => "???", "state" => HealthMonitorStates::UNKNOWN }) #insert -1 for unknown records end end + else + v["state"] = HealthMonitorStates::UNKNOWN + @log.info "Records Size: #{size} Records: #{v['records']} Record Count: #{v['record_count']}" + return #simply return the state as unknown here + end - if size == 1 - state_index = 0 - else - state_threshold = config['StateThresholdPercentage'].to_f - count = ((state_threshold*size)/100).ceil - state_index = size - count - end - v["state"] = v["records"][state_index]["state"] + if size == 1 + state_index = 0 + else + state_threshold = config['StateThresholdPercentage'].to_f + count = ((state_threshold*size)/100).ceil + state_index = size - count end + v["state"] = v["records"][state_index]["state"] end def calculate_container_instance_state(counter_value, limit, config) diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index e21fdc83d..4a91281a3 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -149,33 +149,33 @@ def get_pods_ready_hash(pod_inventory, deployment_inventory) def get_node_state_from_node_conditions(monitor_config, node_conditions) pass = false - failtypes = ['outofdisk', 'networkunavailable'].to_set #default fail types + failtypes = ['diskpressure', 'networkunavailable'].to_set #default fail types if !monitor_config.nil? && !monitor_config["NodeConditionTypesForFailedState"].nil? failtypes = monitor_config["NodeConditionTypesForFailedState"] - if !failtypes.nil? - failtypes = failtypes.split(',').map{|x| x.downcase}.map{|x| x.gsub(" ","")}.to_set - end + if !failtypes.nil? + failtypes = failtypes.split(',').map{|x| x.downcase}.map{|x| x.gsub(" ","")}.to_set + end end - log = get_log_handle - #log.info "Fail Types #{failtypes.inspect}" + log = get_log_handle + #log.info "Fail Types #{failtypes.inspect}" node_conditions.each do |condition| type = condition['type'] status = condition['status'] #for each condition in the configuration, check if the type is not false. If yes, update state to fail if (failtypes.include?(type.downcase) && (status == 'True' || status == 'Unknown')) - return "fail" - elsif ((type == "DiskPressure" || type == "MemoryPressure" || type == "PIDPressure") && (status == 'True' || status == 'Unknown')) - return "warn" + return HealthMonitorStates::FAIL + elsif ((type == "MemoryPressure" || type == "PIDPressure") && (status == 'True' || status == 'Unknown')) + return HealthMonitorStates::WARNING elsif type == "Ready" && status == 'True' pass = true end end if pass - return "pass" + return HealthMonitorStates::PASS else - return "fail" + return HealthMonitorStates::FAIL end end From 3e40825e29d4a45e454a6d7d7e3237f68ca0669c Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 24 Oct 2019 10:39:24 -0700 Subject: [PATCH 02/10] Initialize container_cpu_memory_records in fhmb --- source/code/plugin/filter_health_model_builder.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb index 533d4a9a2..6ba5a21c4 100644 --- a/source/code/plugin/filter_health_model_builder.rb +++ b/source/code/plugin/filter_health_model_builder.rb @@ -99,6 +99,10 @@ def filter_stream(tag, es) end container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) deduped_records = container_records_aggregator.dedupe_records(container_records) + if @container_cpu_memory_records.nil? + @log.info "@container_cpu_memory_records was not initialized" + @container_cpu_memory_records = [] #in some clusters, this is null, so initialize it again. + end @container_cpu_memory_records.push(*deduped_records) # push the records for aggregation later return MultiEventStream.new elsif tag.start_with?("kubehealth.ReplicaSet") From 9a389028a013d96896f405ce8b82a5c0bf92ce11 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 30 Oct 2019 09:09:24 -0700 Subject: [PATCH 03/10] Added telemetry to investigate health exceptions --- .../plugin/filter_health_model_builder.rb | 9 ++++--- .../health_container_cpu_memory_aggregator.rb | 24 +++++++++++++++---- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb index 6ba5a21c4..47ce7a631 100644 --- a/source/code/plugin/filter_health_model_builder.rb +++ b/source/code/plugin/filter_health_model_builder.rb @@ -39,17 +39,16 @@ def initialize @kube_api_down_handler = HealthKubeApiDownHandler.new @resources = HealthKubernetesResources.instance @reducer = HealthSignalReducer.new - @state = HealthMonitorState.new @generator = HealthMissingSignalGenerator.new - #TODO: cluster_labels needs to be initialized @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) - deserialized_state_info = @cluster_health_state.get_state - @state = HealthMonitorState.new - @state.initialize_state(deserialized_state_info) @cluster_old_state = 'none' @cluster_new_state = 'none' @container_cpu_memory_records = [] @telemetry = HealthMonitorTelemetry.new + @state = HealthMonitorState.new + # move network calls to the end. This will ensure all the instance variables get initialized + deserialized_state_info = @cluster_health_state.get_state + @state.initialize_state(deserialized_state_info) rescue => e ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index 097268439..f2cb77e20 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -1,4 +1,5 @@ require_relative 'health_model_constants' +require_relative '../ApplicationInsightsUtility' =begin @cpu_records/@memory_records [ @@ -37,6 +38,10 @@ class HealthContainerCpuMemoryAggregator @@memory_counter_name = 'memoryRssBytes' @@cpu_counter_name = 'cpuUsageNanoCores' + @@workload_container_count_empty_event_sent = false + @@limit_is_array_event_sent = false + @@WORKLOAD_CONTAINER_COUNT_EMPTY_EVENT = "WorkloadContainerCountEmptyEvent" + @@LIMIT_IS_ARRAY_EVENT = "ResourceLimitIsAnArrayEvent" def initialize(resources, provider) @pod_uid_lookup = resources.get_pod_uid_lookup @workload_container_count = resources.get_workload_container_count @@ -169,10 +174,11 @@ def get_records cpu_limit_mc = record["limit"]/1000000.to_f else @log.info "CPU Limit is not a number #{record['limit']}" - if record["limit"].is_a?(Array) - if record["limit"].size > 0 - cpu_limit_mc = (record["limit"][0])/1000000.to_f - end + if !@@limit_is_array_event_sent + custom_properties = {} + custom_properties['limit'] = record['limit'] + @@limit_is_array_event_sent = true + ApplicationInsightsUtility.sendCustomEvent(@@LIMIT_IS_ARRAY_EVENT, custom_properties) end end health_monitor_record = { @@ -244,7 +250,15 @@ def calculate_monitor_state(v, config) end else v["state"] = HealthMonitorStates::UNKNOWN - @log.info "Records Size: #{size} Records: #{v['records']} Record Count: #{v['record_count']}" + @log.info "Records Size: #{size} Records: #{v['records']} Record Count: #{v['record_count']} #{@workload_container_count}" + + if !@@workload_container_count_empty_event_sent + custom_properties = {} + custom_properties.merge(v) + custom_properties.merge(@workload_container_count) + @@workload_container_count_empty_event_sent = true + ApplicationInsightsUtility.sendCustomEvent(WORKLOAD_CONTAINER_COUNT_EMPTY_EVENT, custom_properties) + end return #simply return the state as unknown here end From 0e87bfbc0d19c883e2afddb62c8613b5e70630c9 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 30 Oct 2019 09:25:40 -0700 Subject: [PATCH 04/10] Conditionally load ApplicationInsightsUtility --- .../plugin/health/health_container_cpu_memory_aggregator.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index f2cb77e20..596481c84 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -1,5 +1,9 @@ require_relative 'health_model_constants' -require_relative '../ApplicationInsightsUtility' + +# otherwise unit tests will fail due to ApplicationInsightsUtility dependency on base omsagent ruby files. If you have your dev machine starting with omsagent-rs, then GOOD LUCK! +if Socket.gethostname.start_with?('omsagent-rs') + require_relative '../ApplicationInsightsUtility' +end =begin @cpu_records/@memory_records [ From cdf94354802356a957eef7b061ee4048cf7ef42c Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 30 Oct 2019 09:25:54 -0700 Subject: [PATCH 05/10] Conditionally load ApplicationInsightsUtility --- .../code/plugin/health/health_container_cpu_memory_aggregator.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index 596481c84..9e2ad519a 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -1,5 +1,6 @@ require_relative 'health_model_constants' +# Require only when running inside container. # otherwise unit tests will fail due to ApplicationInsightsUtility dependency on base omsagent ruby files. If you have your dev machine starting with omsagent-rs, then GOOD LUCK! if Socket.gethostname.start_with?('omsagent-rs') require_relative '../ApplicationInsightsUtility' From 95c8b728604b8a5349359974f1b29f1dc7afcaaa Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 30 Oct 2019 10:19:18 -0700 Subject: [PATCH 06/10] Set frozen_string_literal to true --- source/code/plugin/health/agg_monitor_id_labels.rb | 1 + source/code/plugin/health/aggregate_monitor_state_finalizer.rb | 2 ++ source/code/plugin/health/cluster_health_state.rb | 2 ++ .../plugin/health/health_container_cpu_memory_aggregator.rb | 2 ++ .../health/health_container_cpu_memory_record_formatter.rb | 2 ++ source/code/plugin/health/health_hierarchy_builder.rb | 2 ++ source/code/plugin/health/health_kube_api_down_handler.rb | 2 ++ source/code/plugin/health/health_kubernetes_resources.rb | 2 ++ source/code/plugin/health/health_missing_signal_generator.rb | 2 ++ source/code/plugin/health/health_model_buffer.rb | 2 ++ source/code/plugin/health/health_model_builder.rb | 1 + source/code/plugin/health/health_model_constants.rb | 1 + source/code/plugin/health/health_model_definition_parser.rb | 1 + source/code/plugin/health/health_monitor_helpers.rb | 1 + source/code/plugin/health/health_monitor_optimizer.rb | 1 + source/code/plugin/health/health_monitor_provider.rb | 1 + source/code/plugin/health/health_monitor_record.rb | 1 + source/code/plugin/health/health_monitor_state.rb | 1 + source/code/plugin/health/health_monitor_telemetry.rb | 1 + source/code/plugin/health/health_monitor_utils.rb | 1 + source/code/plugin/health/health_signal_reducer.rb | 1 + source/code/plugin/health/monitor_factory.rb | 1 + source/code/plugin/health/parent_monitor_provider.rb | 1 + source/code/plugin/health/unit_monitor.rb | 1 + source/code/plugin/out_health_forward.rb | 1 + 25 files changed, 34 insertions(+) diff --git a/source/code/plugin/health/agg_monitor_id_labels.rb b/source/code/plugin/health/agg_monitor_id_labels.rb index d5c724a86..03680d054 100644 --- a/source/code/plugin/health/agg_monitor_id_labels.rb +++ b/source/code/plugin/health/agg_monitor_id_labels.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' module HealthModel diff --git a/source/code/plugin/health/aggregate_monitor_state_finalizer.rb b/source/code/plugin/health/aggregate_monitor_state_finalizer.rb index 74e780924..dd69c9c4d 100644 --- a/source/code/plugin/health/aggregate_monitor_state_finalizer.rb +++ b/source/code/plugin/health/aggregate_monitor_state_finalizer.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + module HealthModel class AggregateMonitorStateFinalizer diff --git a/source/code/plugin/health/cluster_health_state.rb b/source/code/plugin/health/cluster_health_state.rb index 3b56dd243..fa9cb42b2 100644 --- a/source/code/plugin/health/cluster_health_state.rb +++ b/source/code/plugin/health/cluster_health_state.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require "net/http" require "net/https" require "uri" diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index 9e2ad519a..3ea1a2a07 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require_relative 'health_model_constants' # Require only when running inside container. diff --git a/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb b/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb index 5c7db82d9..0c3f061f1 100644 --- a/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb +++ b/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + module HealthModel class HealthContainerCpuMemoryRecordFormatter diff --git a/source/code/plugin/health/health_hierarchy_builder.rb b/source/code/plugin/health/health_hierarchy_builder.rb index 2da0050db..bb48e083b 100644 --- a/source/code/plugin/health/health_hierarchy_builder.rb +++ b/source/code/plugin/health/health_hierarchy_builder.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require 'json' module HealthModel class HealthHierarchyBuilder diff --git a/source/code/plugin/health/health_kube_api_down_handler.rb b/source/code/plugin/health/health_kube_api_down_handler.rb index a87c43ef1..bb91f2e3b 100644 --- a/source/code/plugin/health/health_kube_api_down_handler.rb +++ b/source/code/plugin/health/health_kube_api_down_handler.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require_relative 'health_model_constants' module HealthModel class HealthKubeApiDownHandler diff --git a/source/code/plugin/health/health_kubernetes_resources.rb b/source/code/plugin/health/health_kubernetes_resources.rb index 30a9ac7ca..c986b2db4 100644 --- a/source/code/plugin/health/health_kubernetes_resources.rb +++ b/source/code/plugin/health/health_kubernetes_resources.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require 'singleton' require_relative 'health_model_constants' diff --git a/source/code/plugin/health/health_missing_signal_generator.rb b/source/code/plugin/health/health_missing_signal_generator.rb index 1827a0190..84af81ea7 100644 --- a/source/code/plugin/health/health_missing_signal_generator.rb +++ b/source/code/plugin/health/health_missing_signal_generator.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require_relative 'health_model_constants' require_relative 'health_monitor_record' diff --git a/source/code/plugin/health/health_model_buffer.rb b/source/code/plugin/health/health_model_buffer.rb index 1ccfe7349..1c3ec3332 100644 --- a/source/code/plugin/health/health_model_buffer.rb +++ b/source/code/plugin/health/health_model_buffer.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + module HealthModel =begin diff --git a/source/code/plugin/health/health_model_builder.rb b/source/code/plugin/health/health_model_builder.rb index 13813c8d9..43ed30d05 100644 --- a/source/code/plugin/health/health_model_builder.rb +++ b/source/code/plugin/health/health_model_builder.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require 'time' module HealthModel diff --git a/source/code/plugin/health/health_model_constants.rb b/source/code/plugin/health/health_model_constants.rb index 0922c7ff2..c74f86f4d 100644 --- a/source/code/plugin/health/health_model_constants.rb +++ b/source/code/plugin/health/health_model_constants.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true module HealthModel class MonitorState CRITICAL = "fail" diff --git a/source/code/plugin/health/health_model_definition_parser.rb b/source/code/plugin/health/health_model_definition_parser.rb index f6c7a781d..907bc1fd1 100644 --- a/source/code/plugin/health/health_model_definition_parser.rb +++ b/source/code/plugin/health/health_model_definition_parser.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true =begin Class to parse the health model definition. The definition expresses the relationship between monitors, how to roll up to an aggregate monitor, and what labels to "pass on" to the parent monitor diff --git a/source/code/plugin/health/health_monitor_helpers.rb b/source/code/plugin/health/health_monitor_helpers.rb index f784ae76e..74aa35af0 100644 --- a/source/code/plugin/health/health_monitor_helpers.rb +++ b/source/code/plugin/health/health_monitor_helpers.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require 'logger' require 'digest' require_relative 'health_model_constants' diff --git a/source/code/plugin/health/health_monitor_optimizer.rb b/source/code/plugin/health/health_monitor_optimizer.rb index b33c8a986..a63d59abf 100644 --- a/source/code/plugin/health/health_monitor_optimizer.rb +++ b/source/code/plugin/health/health_monitor_optimizer.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true module HealthModel class HealthMonitorOptimizer #ctor diff --git a/source/code/plugin/health/health_monitor_provider.rb b/source/code/plugin/health/health_monitor_provider.rb index e75824268..b36c46370 100644 --- a/source/code/plugin/health/health_monitor_provider.rb +++ b/source/code/plugin/health/health_monitor_provider.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' module HealthModel diff --git a/source/code/plugin/health/health_monitor_record.rb b/source/code/plugin/health/health_monitor_record.rb index 873736c3a..7df84ff53 100644 --- a/source/code/plugin/health/health_monitor_record.rb +++ b/source/code/plugin/health/health_monitor_record.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true HealthMonitorRecord = Struct.new( :monitor_id, :monitor_instance_id, diff --git a/source/code/plugin/health/health_monitor_state.rb b/source/code/plugin/health/health_monitor_state.rb index 8e2294cc9..16f8bedc4 100644 --- a/source/code/plugin/health/health_monitor_state.rb +++ b/source/code/plugin/health/health_monitor_state.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' module HealthModel diff --git a/source/code/plugin/health/health_monitor_telemetry.rb b/source/code/plugin/health/health_monitor_telemetry.rb index 4e80a5145..1227e1f83 100644 --- a/source/code/plugin/health/health_monitor_telemetry.rb +++ b/source/code/plugin/health/health_monitor_telemetry.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' require 'socket' if Socket.gethostname.start_with?('omsagent-rs') diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index 4a91281a3..ce4f25395 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require 'logger' require 'digest' require_relative 'health_model_constants' diff --git a/source/code/plugin/health/health_signal_reducer.rb b/source/code/plugin/health/health_signal_reducer.rb index f92f24ac3..4708c4ee5 100644 --- a/source/code/plugin/health/health_signal_reducer.rb +++ b/source/code/plugin/health/health_signal_reducer.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' module HealthModel diff --git a/source/code/plugin/health/monitor_factory.rb b/source/code/plugin/health/monitor_factory.rb index 5f2c3945c..1e4f6f5b8 100644 --- a/source/code/plugin/health/monitor_factory.rb +++ b/source/code/plugin/health/monitor_factory.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'aggregate_monitor' require_relative 'unit_monitor' diff --git a/source/code/plugin/health/parent_monitor_provider.rb b/source/code/plugin/health/parent_monitor_provider.rb index 4ab6e6297..e5766ea1b 100644 --- a/source/code/plugin/health/parent_monitor_provider.rb +++ b/source/code/plugin/health/parent_monitor_provider.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' module HealthModel class ParentMonitorProvider diff --git a/source/code/plugin/health/unit_monitor.rb b/source/code/plugin/health/unit_monitor.rb index 9af599321..6454007b6 100644 --- a/source/code/plugin/health/unit_monitor.rb +++ b/source/code/plugin/health/unit_monitor.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' require 'json' diff --git a/source/code/plugin/out_health_forward.rb b/source/code/plugin/out_health_forward.rb index 18664a22a..6fcfe368b 100644 --- a/source/code/plugin/out_health_forward.rb +++ b/source/code/plugin/out_health_forward.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true # # Fluentd # From 7bbe8a4c40aafc55d78076276baf804d10d01275 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 30 Oct 2019 21:08:48 -0700 Subject: [PATCH 07/10] Send event once per container when lookup is empty, or limit is an array --- installer/conf/healthmonitorconfig.json | 2 +- .../health_container_cpu_memory_aggregator.rb | 28 ++++++++++++------- .../health/health_kubernetes_resources.rb | 5 ++++ .../plugin/health/health_monitor_utils.rb | 4 +-- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/installer/conf/healthmonitorconfig.json b/installer/conf/healthmonitorconfig.json index 21273f5a0..e4019fe73 100644 --- a/installer/conf/healthmonitorconfig.json +++ b/installer/conf/healthmonitorconfig.json @@ -31,6 +31,6 @@ "ConsecutiveSamplesForStateTransition": 2 }, "node_condition": { - "NodeConditionTypesForFailedState": "diskpressure,networkunavailable" + "NodeConditionTypesForFailedState": "outofdisk,networkunavailable" } } \ No newline at end of file diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index 3ea1a2a07..f6b57e0ae 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -45,8 +45,8 @@ class HealthContainerCpuMemoryAggregator @@memory_counter_name = 'memoryRssBytes' @@cpu_counter_name = 'cpuUsageNanoCores' - @@workload_container_count_empty_event_sent = false - @@limit_is_array_event_sent = false + @@workload_container_count_empty_event_sent = {} + @@limit_is_array_event_sent = {} @@WORKLOAD_CONTAINER_COUNT_EMPTY_EVENT = "WorkloadContainerCountEmptyEvent" @@LIMIT_IS_ARRAY_EVENT = "ResourceLimitIsAnArrayEvent" def initialize(resources, provider) @@ -181,10 +181,16 @@ def get_records cpu_limit_mc = record["limit"]/1000000.to_f else @log.info "CPU Limit is not a number #{record['limit']}" - if !@@limit_is_array_event_sent + if !@@limit_is_array_event_sent.key?(resource_key) custom_properties = {} custom_properties['limit'] = record['limit'] - @@limit_is_array_event_sent = true + if record['limit'].is_a?(Array) + record['limit'].each_index{|i| + custom_properties[i] = record['limit'][i] + } + end + @@limit_is_array_event_sent[resource_key] = true + #send once per resource key ApplicationInsightsUtility.sendCustomEvent(@@LIMIT_IS_ARRAY_EVENT, custom_properties) end end @@ -257,14 +263,16 @@ def calculate_monitor_state(v, config) end else v["state"] = HealthMonitorStates::UNKNOWN - @log.info "Records Size: #{size} Records: #{v['records']} Record Count: #{v['record_count']} #{@workload_container_count}" + container_key = "#{v['workload_name']}~~#{v['container']}" + @log.info "ContainerKey: #{container_key} Records Size: #{size} Records: #{v['records']} Record Count: #{v['record_count']} #{@workload_container_count}" - if !@@workload_container_count_empty_event_sent + if !@@workload_container_count_empty_event_sent.key?(container_key) custom_properties = {} - custom_properties.merge(v) - custom_properties.merge(@workload_container_count) - @@workload_container_count_empty_event_sent = true - ApplicationInsightsUtility.sendCustomEvent(WORKLOAD_CONTAINER_COUNT_EMPTY_EVENT, custom_properties) + custom_properties = custom_properties.merge(v) + custom_properties = custom_properties.merge(@workload_container_count) + @log.info "Custom Properties : #{custom_properties}" + @@workload_container_count_empty_event_sent[container_key] = true + ApplicationInsightsUtility.sendCustomEvent(@@WORKLOAD_CONTAINER_COUNT_EMPTY_EVENT, custom_properties) end return #simply return the state as unknown here end diff --git a/source/code/plugin/health/health_kubernetes_resources.rb b/source/code/plugin/health/health_kubernetes_resources.rb index c986b2db4..4cbf39578 100644 --- a/source/code/plugin/health/health_kubernetes_resources.rb +++ b/source/code/plugin/health/health_kubernetes_resources.rb @@ -53,7 +53,12 @@ def get_workload_names end def build_pod_uid_lookup + if @pod_inventory.nil? || @pod_inventory['items'].nil? || @pod_inventory['items'].empty? || @pod_inventory['items'].size == 0 + @log.info "Not Clearing pod_uid_lookup and workload_container_count since pod inventory is nil" + return + end @workload_container_count = {} + @pod_uid_lookup = {} @pod_inventory['items'].each do |pod| begin namespace = pod['metadata']['namespace'] diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index ce4f25395..33eebc98b 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -150,7 +150,7 @@ def get_pods_ready_hash(pod_inventory, deployment_inventory) def get_node_state_from_node_conditions(monitor_config, node_conditions) pass = false - failtypes = ['diskpressure', 'networkunavailable'].to_set #default fail types + failtypes = ['outofdisk', 'networkunavailable'].to_set #default fail types if !monitor_config.nil? && !monitor_config["NodeConditionTypesForFailedState"].nil? failtypes = monitor_config["NodeConditionTypesForFailedState"] if !failtypes.nil? @@ -166,7 +166,7 @@ def get_node_state_from_node_conditions(monitor_config, node_conditions) #for each condition in the configuration, check if the type is not false. If yes, update state to fail if (failtypes.include?(type.downcase) && (status == 'True' || status == 'Unknown')) return HealthMonitorStates::FAIL - elsif ((type == "MemoryPressure" || type == "PIDPressure") && (status == 'True' || status == 'Unknown')) + elsif ((type == "DiskPressure" || type == "MemoryPressure" || type == "PIDPressure") && (status == 'True' || status == 'Unknown')) return HealthMonitorStates::WARNING elsif type == "Ready" && status == 'True' pass = true From 319a02fe45add9625a26b8398f36b1c92187bf51 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 30 Oct 2019 22:34:07 -0700 Subject: [PATCH 08/10] Use the right class variable --- source/code/plugin/in_kube_health.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index 9a1b8f9a9..159c764a9 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -121,7 +121,7 @@ def enumerate health_monitor_records.push(record) if record end else - hmlog.info "POD INVENTORY IS NIL" + @@hmlog.info "POD INVENTORY IS NIL" end if !node_inventory.nil? @@ -130,7 +130,7 @@ def enumerate health_monitor_records.push(record) if record end else - hmlog.info "NODE INVENTORY IS NIL" + @@hmlog.info "NODE INVENTORY IS NIL" end health_monitor_records.each do |record| From 28e846e8f85f033dc7a25780623e144e9bf77065 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 1 Nov 2019 00:58:36 -0700 Subject: [PATCH 09/10] Unit Tests, Use RS and POD to determine workload --- .../health/health_kubernetes_resources.rb | 78 +- .../plugin/health/health_monitor_utils.rb | 54 +- source/code/plugin/in_kube_health.rb | 10 +- ...th_container_cpu_memory_aggregator_spec.rb | 8 +- .../health/health_kubernetes_resource_spec.rb | 26 +- .../health/health_model_builder_test.rb | 977 +++++++++--------- 6 files changed, 548 insertions(+), 605 deletions(-) diff --git a/source/code/plugin/health/health_kubernetes_resources.rb b/source/code/plugin/health/health_kubernetes_resources.rb index 4cbf39578..954c090ac 100644 --- a/source/code/plugin/health/health_kubernetes_resources.rb +++ b/source/code/plugin/health/health_kubernetes_resources.rb @@ -7,20 +7,20 @@ module HealthModel class HealthKubernetesResources include Singleton - attr_accessor :node_inventory, :pod_inventory, :deployment_inventory, :pod_uid_lookup, :workload_container_count + attr_accessor :node_inventory, :pod_inventory, :replicaset_inventory, :pod_uid_lookup, :workload_container_count attr_reader :nodes, :pods, :workloads, :deployment_lookup def initialize - @node_inventory = [] - @pod_inventory = [] - @deployment_inventory = [] + @node_inventory = {} + @pod_inventory = {} + @replicaset_inventory = {} @nodes = [] @pods = [] @workloads = [] @log = HealthMonitorHelpers.get_log_handle @pod_uid_lookup = {} - @deployment_lookup = {} @workload_container_count = {} + @workload_name_cache = {} end def get_node_inventory @@ -38,9 +38,8 @@ def get_nodes return @nodes end - def set_deployment_inventory(deployments) - @deployment_inventory = deployments - @deployment_lookup = {} + def set_replicaset_inventory(replicasets) + @replicaset_inventory = replicasets end def get_workload_names @@ -99,7 +98,7 @@ def build_pod_uid_lookup end end rescue => e - @log.info "Error in build_pod_uid_lookup #{pod} #{e.message}" + @log.info "Error in build_pod_uid_lookup for POD: #{pod_name} #{e.message} #{e.backtrace}" end end end @@ -112,19 +111,7 @@ def get_workload_container_count return @workload_container_count end - private def get_workload_name(pod) - - if @deployment_lookup.empty? - @deployment_inventory['items'].each do |deployment| - match_labels = deployment['spec']['selector']['matchLabels'].to_h - namespace = deployment['metadata']['namespace'] - match_labels.each{|k,v| - @deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" - } - end - end - begin has_owner = !pod['metadata']['ownerReferences'].nil? owner_kind = '' @@ -136,7 +123,6 @@ def get_workload_name(pod) controller_name = pod['metadata']['name'] end namespace = pod['metadata']['namespace'] - workload_name = '' if owner_kind.nil? owner_kind = 'Pod' @@ -146,41 +132,22 @@ def get_workload_name(pod) # we are excluding jobs return nil when 'replicaset' - # get the labels, and see if there is a match. If there is, it is the deployment. If not, use replica set name/controller name - labels = pod['metadata']['labels'].to_h - labels.each {|k,v| - lookup_key = "#{namespace}-#{k}=#{v}" - if @deployment_lookup.key?(lookup_key) - workload_name = @deployment_lookup[lookup_key] - break - end - } - if workload_name.empty? - workload_name = "#{namespace}~~#{controller_name}" - end + #TODO: + workload_name = get_replica_set_owner_ref(controller_name) + workload_name = "#{namespace}~~#{workload_name}" when 'daemonset' workload_name = "#{namespace}~~#{controller_name}" else - workload_name = "#{namespace}~~#{pod['metadata']['name']}" + workload_name = "#{namespace}~~#{controller_name}" end return workload_name rescue => e - @log.info "Error in get_workload_name(pod) #{e.message}" + @log.info "Error in get_workload_name(pod) #{e.message} #{e.backtrace}" return nil end end def get_workload_kind(pod) - if @deployment_lookup.empty? - @deployment_inventory['items'].each do |deployment| - match_labels = deployment['spec']['selector']['matchLabels'].to_h - namespace = deployment['metadata']['namespace'] - match_labels.each{|k,v| - @deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" - } - end - end - begin has_owner = !pod['metadata']['ownerReferences'].nil? owner_kind = '' @@ -200,6 +167,25 @@ def get_workload_kind(pod) end end + private + def get_replica_set_owner_ref(controller_name) + if @workload_name_cache.key?(controller_name) + return @workload_name_cache[controller_name] + end + owner_ref = controller_name + @replicaset_inventory['items'].each{|rs| + rs_name = rs['metadata']['name'] + if controller_name.casecmp(rs_name) == 0 + if !rs['metadata']['ownerReferences'].nil? + owner_ref = rs['metadata']['ownerReferences'][0]['name'] if rs['metadata']['ownerReferences'][0]['name'] + end + break + end + } + @workload_name_cache[controller_name] = owner_ref + return owner_ref + end + def get_node_capacity(node_name, type) if node_name.nil? #unscheduled pods will not have a node name return -1 diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index 33eebc98b..0d297d215 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -74,59 +74,17 @@ def is_cluster_health_model_enabled end end - def get_pods_ready_hash(pod_inventory, deployment_inventory) + def get_pods_ready_hash(resources) pods_ready_percentage_hash = {} - deployment_lookup = {} - deployment_inventory['items'].each do |deployment| - match_labels = deployment['spec']['selector']['matchLabels'].to_h - namespace = deployment['metadata']['namespace'] - match_labels.each{|k,v| - deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" - } - end - pod_inventory['items'].each do |pod| + resources.pod_inventory['items'].each do |pod| begin - has_owner = !pod['metadata']['ownerReferences'].nil? - owner_kind = '' - if has_owner - owner_kind = pod['metadata']['ownerReferences'][0]['kind'] - controller_name = pod['metadata']['ownerReferences'][0]['name'] - else - owner_kind = pod['kind'] - controller_name = pod['metadata']['name'] - #log.info "#{JSON.pretty_generate(pod)}" - end - + workload_name = resources.get_workload_name(pod) namespace = pod['metadata']['namespace'] status = pod['status']['phase'] - - workload_name = '' - if owner_kind.nil? - owner_kind = 'Pod' - end - case owner_kind.downcase - when 'job' - # we are excluding jobs + owner_kind = resources.get_workload_kind(pod) + if owner_kind.casecmp('job') == 0 next - when 'replicaset' - # get the labels, and see if there is a match. If there is, it is the deployment. If not, use replica set name/controller name - labels = pod['metadata']['labels'].to_h - labels.each {|k,v| - lookup_key = "#{namespace}-#{k}=#{v}" - if deployment_lookup.key?(lookup_key) - workload_name = deployment_lookup[lookup_key] - break - end - } - if workload_name.empty? - workload_name = "#{namespace}~~#{controller_name}" - end - when 'daemonset' - workload_name = "#{namespace}~~#{controller_name}" - else - workload_name = "#{namespace}~~#{pod['metadata']['name']}" end - if pods_ready_percentage_hash.key?(workload_name) total_pods = pods_ready_percentage_hash[workload_name]['totalPods'] pods_ready = pods_ready_percentage_hash[workload_name]['podsReady'] @@ -142,7 +100,7 @@ def get_pods_ready_hash(pod_inventory, deployment_inventory) pods_ready_percentage_hash[workload_name] = {'totalPods' => total_pods, 'podsReady' => pods_ready, 'namespace' => namespace, 'workload_name' => workload_name, 'kind' => owner_kind} rescue => e - log.info "Error when processing pod #{pod['metadata']['name']} #{e.message}" + @log.info "Error when processing pod #{pod['metadata']['name']} #{e.message}" end end return pods_ready_percentage_hash diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index 159c764a9..756c4ea25 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -86,11 +86,11 @@ def enumerate node_inventory = JSON.parse(node_inventory_response.body) pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods") pod_inventory = JSON.parse(pod_inventory_response.body) - deployment_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("deployments", api_version: "extensions/v1beta1").body) + replicaset_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("replicasets", api_version: "extensions/v1beta1").body) @resources.node_inventory = node_inventory @resources.pod_inventory = pod_inventory - @resources.set_deployment_inventory(deployment_inventory) + @resources.set_replicaset_inventory(replicaset_inventory) @resources.build_pod_uid_lookup if node_inventory_response.code.to_i != 200 @@ -106,7 +106,7 @@ def enumerate health_monitor_records.push(record) if record record = process_memory_oversubscribed_monitor(pod_inventory, node_inventory) health_monitor_records.push(record) if record - pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(pod_inventory, deployment_inventory) + pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(@resources) system_pods = pods_ready_hash.select{|k,v| v['namespace'] == 'kube-system'} workload_pods = pods_ready_hash.select{|k,v| v['namespace'] != 'kube-system'} @@ -291,11 +291,11 @@ def initialize_inventory node_inventory = JSON.parse(node_inventory_response.body) pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods") pod_inventory = JSON.parse(pod_inventory_response.body) - deployment_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("deployments", api_version: "extensions/v1beta1").body) + replicaset_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("replicasets", api_version: "extensions/v1beta1").body) @resources.node_inventory = node_inventory @resources.pod_inventory = pod_inventory - @resources.set_deployment_inventory(deployment_inventory) + @resources.set_replicaset_inventory(replicaset_inventory) @resources.build_pod_uid_lookup end diff --git a/test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb b/test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb index 074878fe2..6972916bf 100644 --- a/test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb +++ b/test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb @@ -25,7 +25,7 @@ resources.pod_inventory = pods resources.node_inventory = nodes - resources.set_deployment_inventory(deployments) + resources.set_replicaset_inventory(deployments) resources.build_pod_uid_lookup #call this in in_kube_health every min cluster_labels = { @@ -60,7 +60,7 @@ resources.pod_inventory = pods resources.node_inventory = nodes - resources.set_deployment_inventory(deployments) + resources.set_replicaset_inventory(deployments) resources.build_pod_uid_lookup #call this in in_kube_health every min cluster_labels = { @@ -113,7 +113,7 @@ resources.pod_inventory = pods resources.node_inventory = nodes - resources.set_deployment_inventory(deployments) + resources.set_replicaset_inventory(deployments) resources.build_pod_uid_lookup #call this in in_kube_health every min cluster_labels = { @@ -163,7 +163,7 @@ resources.pod_inventory = pods resources.node_inventory = nodes - resources.set_deployment_inventory(deployments) + resources.set_replicaset_inventory(deployments) resources.build_pod_uid_lookup #call this in in_kube_health every min cluster_labels = { diff --git a/test/code/plugin/health/health_kubernetes_resource_spec.rb b/test/code/plugin/health/health_kubernetes_resource_spec.rb index dbeec4858..f4daedace 100644 --- a/test/code/plugin/health/health_kubernetes_resource_spec.rb +++ b/test/code/plugin/health/health_kubernetes_resource_spec.rb @@ -207,7 +207,7 @@ resources = HealthKubernetesResources.instance resources.node_inventory = nodes resources.pod_inventory = pods - resources.set_deployment_inventory(deployments) + resources.set_replicaset_inventory(deployments) #act parsed_nodes = resources.get_nodes parsed_workloads = resources.get_workload_names @@ -217,28 +217,6 @@ assert_equal parsed_workloads.size, 3 assert_equal parsed_nodes, ['aks-nodepool1-19574989-0', 'aks-nodepool1-19574989-1'] - parsed_workloads.sort.must_equal ['default~~diliprdeploymentnodeapps', 'default~~rss-site', 'kube-system~~kube-proxy'].sort + parsed_workloads.sort.must_equal ['default~~diliprdeploymentnodeapps-c4fdfb446', 'default~~rss-site', 'kube-system~~kube-proxy'].sort end - - # it 'builds the pod_uid lookup correctly' do - # #arrange - # f = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json') - # nodes = JSON.parse(f) - # f = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json') - # pods = JSON.parse(f) - # f = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/deployments.json') - # deployments = JSON.parse(f) - - # resources = HealthKubernetesResources.instance - - # resources.node_inventory = nodes - # resources.pod_inventory = pods - # resources.set_deployment_inventory(deployments) #resets deployment_lookup -- this was causing Unit test failures - - # resources.build_pod_uid_lookup - - # resources.pod_uid_lookup - # resources.workload_container_count - - # end end \ No newline at end of file diff --git a/test/code/plugin/health/health_model_builder_test.rb b/test/code/plugin/health/health_model_builder_test.rb index a7c5e0927..3015ae55f 100644 --- a/test/code/plugin/health/health_model_builder_test.rb +++ b/test/code/plugin/health/health_model_builder_test.rb @@ -7,489 +7,510 @@ class FilterHealthModelBuilderTest < Test::Unit::TestCase include HealthModel - def test_event_stream - #setup - health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json') - health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) - monitor_factory = MonitorFactory.new - hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) - # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side - state_finalizers = [AggregateMonitorStateFinalizer.new] - monitor_set = MonitorSet.new - model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) - - nodes_file_map = { - #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", - "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - } - - pods_file_map = { - #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", - "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - } - - cluster_labels = { - 'container.azm.ms/cluster-region' => 'eastus', - 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', - 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', - 'container.azm.ms/cluster-name' => 'dilipr-health-test' - } - - cluster_id = 'fake_cluster_id' - - #test - state = HealthMonitorState.new() - generator = HealthMissingSignalGenerator.new - - for scenario in ["first", "second", "third"] - mock_data_path = File.join(__dir__, "../../../../health_records/#{scenario}_daemon_set_signals.json") - file = File.read(mock_data_path) - records = JSON.parse(file) - - node_inventory = JSON.parse(File.read(nodes_file_map[scenario])) - pod_inventory = JSON.parse(File.read(pods_file_map[scenario])) - deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/deployments.json"))) - resources = HealthKubernetesResources.instance - resources.node_inventory = node_inventory - resources.pod_inventory = pod_inventory - resources.set_deployment_inventory(deployment_inventory) - - workload_names = resources.get_workload_names - provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) - - health_monitor_records = [] - records.each do |record| - monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] - monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] - health_monitor_record = HealthMonitorRecord.new( - record[HealthMonitorRecordFields::MONITOR_ID], - record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], - record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], - record[HealthMonitorRecordFields::DETAILS]["state"], - provider.get_labels(record), - provider.get_config(monitor_id), - record[HealthMonitorRecordFields::DETAILS] - ) - - state.update_state(health_monitor_record, - provider.get_config(health_monitor_record.monitor_id) - ) - - # get the health state based on the monitor's operational state - # update state calls updates the state of the monitor based on configuration and history of the the monitor records - health_monitor_record.state = state.get_state(monitor_instance_id).new_state - health_monitor_records.push(health_monitor_record) - instance_state = state.get_state(monitor_instance_id) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - end - - - #handle kube api down - kube_api_down_handler = HealthKubeApiDownHandler.new - health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) - - # Dedupe daemonset signals - # Remove unit monitor signals for “gone” objects - reducer = HealthSignalReducer.new() - reduced_records = reducer.reduce_signals(health_monitor_records, resources) - - cluster_id = 'fake_cluster_id' - - #get the list of 'none' and 'unknown' signals - missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) - #update state for missing signals - missing_signals.each{|signal| - state.update_state(signal, - provider.get_config(signal.monitor_id) - ) - } - generator.update_last_received_records(reduced_records) - reduced_records.push(*missing_signals) - - # build the health model - all_records = reduced_records - model_builder.process_records(all_records) - all_monitors = model_builder.finalize_model - - # update the state for aggregate monitors (unit monitors are updated above) - all_monitors.each{|monitor_instance_id, monitor| - if monitor.is_aggregate_monitor - state.update_state(monitor, - provider.get_config(monitor.monitor_id) - ) - end - - instance_state = state.get_state(monitor_instance_id) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - should_send = instance_state.should_send - - # always send cluster monitor as a heartbeat - if !should_send && monitor_instance_id != MonitorId::CLUSTER - all_monitors.delete(monitor_instance_id) - end - } - - records_to_send = [] - all_monitors.keys.each{|key| - record = provider.get_record(all_monitors[key], state) - #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" - } - - if scenario == "first" - assert_equal 50, all_monitors.size - elsif scenario == "second" - assert_equal 34, all_monitors.size - elsif scenario == "third" - assert_equal 5, all_monitors.size - end - # for each key in monitor.keys, - # get the state from health_monitor_state - # generate the record to send - serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records\health_model_state.json')) - serializer.serialize(state) - - deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records\health_model_state.json')) - deserialized_state = deserializer.deserialize - - after_state = HealthMonitorState.new - after_state.initialize_state(deserialized_state) - end - end - - def test_event_stream_aks_engine - - #setup - health_definition_path = File.join(__dir__, '../../../../installer\conf\health_model_definition.json') - health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) - monitor_factory = MonitorFactory.new - hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) - state_finalizers = [AggregateMonitorStateFinalizer.new] - monitor_set = MonitorSet.new - model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) - - nodes_file_map = { - #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", - #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", - "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", - "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", - } - - pods_file_map = { - #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", - #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", - "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", - "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", - } - - cluster_labels = { - 'container.azm.ms/cluster-region' => 'eastus', - 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', - 'container.azm.ms/cluster-resource-group' => 'aks-engine-health', - 'container.azm.ms/cluster-name' => 'aks-engine-health' - } - - cluster_id = 'fake_cluster_id' - - #test - state = HealthMonitorState.new() - generator = HealthMissingSignalGenerator.new - - for scenario in 1..3 - mock_data_path = File.join(__dir__, "../../../../health_records/aks-engine/aks-engine-#{scenario}.json") - file = File.read(mock_data_path) - records = JSON.parse(file) - - node_inventory = JSON.parse(File.read(nodes_file_map["aks-engine-#{scenario}"])) - pod_inventory = JSON.parse(File.read(pods_file_map["aks-engine-#{scenario}"])) - deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/aks-engine/deployments.json"))) - resources = HealthKubernetesResources.instance - resources.node_inventory = node_inventory - resources.pod_inventory = pod_inventory - resources.deployment_inventory = deployment_inventory - - workload_names = resources.get_workload_names - provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) - - health_monitor_records = [] - records.each do |record| - monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] - monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] - health_monitor_record = HealthMonitorRecord.new( - record[HealthMonitorRecordFields::MONITOR_ID], - record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], - record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], - record[HealthMonitorRecordFields::DETAILS]["state"], - provider.get_labels(record), - provider.get_config(monitor_id), - record[HealthMonitorRecordFields::DETAILS] - ) - - state.update_state(health_monitor_record, - provider.get_config(health_monitor_record.monitor_id) - ) - - # get the health state based on the monitor's operational state - # update state calls updates the state of the monitor based on configuration and history of the the monitor records - health_monitor_record.state = state.get_state(monitor_instance_id).new_state - health_monitor_records.push(health_monitor_record) - instance_state = state.get_state(monitor_instance_id) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - end - - - #handle kube api down - kube_api_down_handler = HealthKubeApiDownHandler.new - health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) - - # Dedupe daemonset signals - # Remove unit monitor signals for “gone” objects - reducer = HealthSignalReducer.new() - reduced_records = reducer.reduce_signals(health_monitor_records, resources) - - cluster_id = 'fake_cluster_id' - - #get the list of 'none' and 'unknown' signals - missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) - #update state for missing signals - missing_signals.each{|signal| - state.update_state(signal, - provider.get_config(signal.monitor_id) - ) - } - generator.update_last_received_records(reduced_records) - reduced_records.push(*missing_signals) - - # build the health model - all_records = reduced_records - model_builder.process_records(all_records) - all_monitors = model_builder.finalize_model - - # update the state for aggregate monitors (unit monitors are updated above) - all_monitors.each{|monitor_instance_id, monitor| - if monitor.is_aggregate_monitor - state.update_state(monitor, - provider.get_config(monitor.monitor_id) - ) - end - - instance_state = state.get_state(monitor_instance_id) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - should_send = instance_state.should_send - - # always send cluster monitor as a heartbeat - if !should_send && monitor_instance_id != MonitorId::CLUSTER - all_monitors.delete(monitor_instance_id) - end - } - - records_to_send = [] - all_monitors.keys.each{|key| - record = provider.get_record(all_monitors[key], state) - #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" - } - - if scenario == 1 - assert_equal 58, all_monitors.size - elsif scenario == 2 - assert_equal 37, all_monitors.size - elsif scenario == 3 - assert_equal 6, all_monitors.size - end - # for each key in monitor.keys, - # get the state from health_monitor_state - # generate the record to send - serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) - serializer.serialize(state) - - deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) - deserialized_state = deserializer.deserialize - - after_state = HealthMonitorState.new - after_state.initialize_state(deserialized_state) - end - end - - def test_container_memory_cpu_with_model - health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json') - health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) - monitor_factory = MonitorFactory.new - hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) - # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side - state_finalizers = [AggregateMonitorStateFinalizer.new] - monitor_set = MonitorSet.new - model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) - - nodes_file_map = { - "first" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", - "second" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", - "third" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", - } - - pods_file_map = { - "first" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", - "second" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", - "third" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", - } - - cluster_labels = { - 'container.azm.ms/cluster-region' => 'eastus', - 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', - 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', - 'container.azm.ms/cluster-name' => 'dilipr-health-test' - } - - cluster_id = 'fake_cluster_id' - - #test - state = HealthMonitorState.new() - generator = HealthMissingSignalGenerator.new - - mock_data_path = "C:/Users/dilipr/desktop/health/container_cpu_memory/daemonset.json" - file = File.read(mock_data_path) - records = JSON.parse(file) - - node_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json")) - pod_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json")) - deployment_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/deployments.json")) + # def test_event_stream + # #setup + # health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json') + # health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + # monitor_factory = MonitorFactory.new + # hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + # # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + # state_finalizers = [AggregateMonitorStateFinalizer.new] + # monitor_set = MonitorSet.new + # model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + # nodes_file_map = { + # #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", + # "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # } + + # pods_file_map = { + # #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", + # "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # } + + # cluster_labels = { + # 'container.azm.ms/cluster-region' => 'eastus', + # 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + # 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + # 'container.azm.ms/cluster-name' => 'dilipr-health-test' + # } + + # cluster_id = 'fake_cluster_id' + + # #test + # state = HealthMonitorState.new() + # generator = HealthMissingSignalGenerator.new + + # for scenario in ["first", "second", "third"] + # mock_data_path = File.join(__dir__, "../../../../health_records/#{scenario}_daemon_set_signals.json") + # file = File.read(mock_data_path) + # records = JSON.parse(file) + + # node_inventory = JSON.parse(File.read(nodes_file_map[scenario])) + # pod_inventory = JSON.parse(File.read(pods_file_map[scenario])) + # deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/deployments.json"))) + # resources = HealthKubernetesResources.instance + # resources.node_inventory = node_inventory + # resources.pod_inventory = pod_inventory + # resources.set_replicaset_inventory(deployment_inventory) + + # workload_names = resources.get_workload_names + # provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + # health_monitor_records = [] + # records.each do |record| + # monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + # monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + # health_monitor_record = HealthMonitorRecord.new( + # record[HealthMonitorRecordFields::MONITOR_ID], + # record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + # record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + # record[HealthMonitorRecordFields::DETAILS]["state"], + # provider.get_labels(record), + # provider.get_config(monitor_id), + # record[HealthMonitorRecordFields::DETAILS] + # ) + + # state.update_state(health_monitor_record, + # provider.get_config(health_monitor_record.monitor_id) + # ) + + # # get the health state based on the monitor's operational state + # # update state calls updates the state of the monitor based on configuration and history of the the monitor records + # health_monitor_record.state = state.get_state(monitor_instance_id).new_state + # health_monitor_records.push(health_monitor_record) + # instance_state = state.get_state(monitor_instance_id) + # #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # end + + + # #handle kube api down + # kube_api_down_handler = HealthKubeApiDownHandler.new + # health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) + + # # Dedupe daemonset signals + # # Remove unit monitor signals for “gone” objects + # reducer = HealthSignalReducer.new() + # reduced_records = reducer.reduce_signals(health_monitor_records, resources) + + # cluster_id = 'fake_cluster_id' + + # #get the list of 'none' and 'unknown' signals + # missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) + # #update state for missing signals + # missing_signals.each{|signal| + # state.update_state(signal, + # provider.get_config(signal.monitor_id) + # ) + # } + # generator.update_last_received_records(reduced_records) + # reduced_records.push(*missing_signals) + + # # build the health model + # all_records = reduced_records + # model_builder.process_records(all_records) + # all_monitors = model_builder.finalize_model + + # # update the state for aggregate monitors (unit monitors are updated above) + # all_monitors.each{|monitor_instance_id, monitor| + # if monitor.is_aggregate_monitor + # state.update_state(monitor, + # provider.get_config(monitor.monitor_id) + # ) + # end + + # instance_state = state.get_state(monitor_instance_id) + # #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # should_send = instance_state.should_send + + # # always send cluster monitor as a heartbeat + # if !should_send && monitor_instance_id != MonitorId::CLUSTER + # all_monitors.delete(monitor_instance_id) + # end + # } + + # records_to_send = [] + # all_monitors.keys.each{|key| + # record = provider.get_record(all_monitors[key], state) + # #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" + # } + + # if scenario == "first" + # assert_equal 50, all_monitors.size + # elsif scenario == "second" + # assert_equal 34, all_monitors.size + # elsif scenario == "third" + # assert_equal 5, all_monitors.size + # end + # # for each key in monitor.keys, + # # get the state from health_monitor_state + # # generate the record to send + # serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records\health_model_state.json')) + # serializer.serialize(state) + + # deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records\health_model_state.json')) + # deserialized_state = deserializer.deserialize + + # after_state = HealthMonitorState.new + # after_state.initialize_state(deserialized_state) + # end + # end + + # def test_event_stream_aks_engine + + # #setup + # health_definition_path = File.join(__dir__, '../../../../installer\conf\health_model_definition.json') + # health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + # monitor_factory = MonitorFactory.new + # hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + # state_finalizers = [AggregateMonitorStateFinalizer.new] + # monitor_set = MonitorSet.new + # model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + # nodes_file_map = { + # #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", + # #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + # "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + # "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + # } + + # pods_file_map = { + # #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", + # #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + # "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + # "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + # } + + # cluster_labels = { + # 'container.azm.ms/cluster-region' => 'eastus', + # 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + # 'container.azm.ms/cluster-resource-group' => 'aks-engine-health', + # 'container.azm.ms/cluster-name' => 'aks-engine-health' + # } + + # cluster_id = 'fake_cluster_id' + + # #test + # state = HealthMonitorState.new() + # generator = HealthMissingSignalGenerator.new + + # for scenario in 1..3 + # mock_data_path = File.join(__dir__, "../../../../health_records/aks-engine/aks-engine-#{scenario}.json") + # file = File.read(mock_data_path) + # records = JSON.parse(file) + + # node_inventory = JSON.parse(File.read(nodes_file_map["aks-engine-#{scenario}"])) + # pod_inventory = JSON.parse(File.read(pods_file_map["aks-engine-#{scenario}"])) + # deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/aks-engine/deployments.json"))) + # resources = HealthKubernetesResources.instance + # resources.node_inventory = node_inventory + # resources.pod_inventory = pod_inventory + # resources.deployment_inventory = deployment_inventory + + # workload_names = resources.get_workload_names + # provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + # health_monitor_records = [] + # records.each do |record| + # monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + # monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + # health_monitor_record = HealthMonitorRecord.new( + # record[HealthMonitorRecordFields::MONITOR_ID], + # record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + # record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + # record[HealthMonitorRecordFields::DETAILS]["state"], + # provider.get_labels(record), + # provider.get_config(monitor_id), + # record[HealthMonitorRecordFields::DETAILS] + # ) + + # state.update_state(health_monitor_record, + # provider.get_config(health_monitor_record.monitor_id) + # ) + + # # get the health state based on the monitor's operational state + # # update state calls updates the state of the monitor based on configuration and history of the the monitor records + # health_monitor_record.state = state.get_state(monitor_instance_id).new_state + # health_monitor_records.push(health_monitor_record) + # instance_state = state.get_state(monitor_instance_id) + # #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # end + + + # #handle kube api down + # kube_api_down_handler = HealthKubeApiDownHandler.new + # health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) + + # # Dedupe daemonset signals + # # Remove unit monitor signals for “gone” objects + # reducer = HealthSignalReducer.new() + # reduced_records = reducer.reduce_signals(health_monitor_records, resources) + + # cluster_id = 'fake_cluster_id' + + # #get the list of 'none' and 'unknown' signals + # missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) + # #update state for missing signals + # missing_signals.each{|signal| + # state.update_state(signal, + # provider.get_config(signal.monitor_id) + # ) + # } + # generator.update_last_received_records(reduced_records) + # reduced_records.push(*missing_signals) + + # # build the health model + # all_records = reduced_records + # model_builder.process_records(all_records) + # all_monitors = model_builder.finalize_model + + # # update the state for aggregate monitors (unit monitors are updated above) + # all_monitors.each{|monitor_instance_id, monitor| + # if monitor.is_aggregate_monitor + # state.update_state(monitor, + # provider.get_config(monitor.monitor_id) + # ) + # end + + # instance_state = state.get_state(monitor_instance_id) + # #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # should_send = instance_state.should_send + + # # always send cluster monitor as a heartbeat + # if !should_send && monitor_instance_id != MonitorId::CLUSTER + # all_monitors.delete(monitor_instance_id) + # end + # } + + # records_to_send = [] + # all_monitors.keys.each{|key| + # record = provider.get_record(all_monitors[key], state) + # #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" + # } + + # if scenario == 1 + # assert_equal 58, all_monitors.size + # elsif scenario == 2 + # assert_equal 37, all_monitors.size + # elsif scenario == 3 + # assert_equal 6, all_monitors.size + # end + # # for each key in monitor.keys, + # # get the state from health_monitor_state + # # generate the record to send + # serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) + # serializer.serialize(state) + + # deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) + # deserialized_state = deserializer.deserialize + + # after_state = HealthMonitorState.new + # after_state.initialize_state(deserialized_state) + # end + # end + + # def test_container_memory_cpu_with_model + # health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json') + # health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + # monitor_factory = MonitorFactory.new + # hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + # # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + # state_finalizers = [AggregateMonitorStateFinalizer.new] + # monitor_set = MonitorSet.new + # model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + # nodes_file_map = { + # "first" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", + # "second" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", + # "third" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", + # } + + # pods_file_map = { + # "first" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", + # "second" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", + # "third" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", + # } + + # cluster_labels = { + # 'container.azm.ms/cluster-region' => 'eastus', + # 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + # 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + # 'container.azm.ms/cluster-name' => 'dilipr-health-test' + # } + + # cluster_id = 'fake_cluster_id' + + # #test + # state = HealthMonitorState.new() + # generator = HealthMissingSignalGenerator.new + + # mock_data_path = "C:/Users/dilipr/desktop/health/container_cpu_memory/daemonset.json" + # file = File.read(mock_data_path) + # records = JSON.parse(file) + + # node_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json")) + # pod_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json")) + # deployment_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/deployments.json")) + # resources = HealthKubernetesResources.instance + # resources.node_inventory = node_inventory + # resources.pod_inventory = pod_inventory + # resources.set_replicaset_inventory(deployment_inventory) + + # workload_names = resources.get_workload_names + # provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + + # #container memory cpu records + # file = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/cadvisor_perf.json') + # cadvisor_records = JSON.parse(file) + # cadvisor_records = cadvisor_records.select{|record| record['DataItems'][0]['ObjectName'] == 'K8SContainer'} + # formatted_records = [] + # formatter = HealthContainerCpuMemoryRecordFormatter.new + # cadvisor_records.each{|record| + # formatted_record = formatter.get_record_from_cadvisor_record(record) + # formatted_records.push(formatted_record) + # } + + # resources.build_pod_uid_lookup #call this in in_kube_health every min + + # cluster_labels = { + # 'container.azm.ms/cluster-region' => 'eastus', + # 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + # 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + # 'container.azm.ms/cluster-name' => 'dilipr-health-test' + # } + + # cluster_id = 'fake_cluster_id' + + # aggregator = HealthContainerCpuMemoryAggregator.new(resources, provider) + # deduped_records = aggregator.dedupe_records(formatted_records) + # aggregator.aggregate(deduped_records) + # aggregator.compute_state + # container_cpu_memory_records = aggregator.get_records + + # records.concat(container_cpu_memory_records) + + # health_monitor_records = [] + # records.each do |record| + # monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + # monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + # health_monitor_record = HealthMonitorRecord.new( + # record[HealthMonitorRecordFields::MONITOR_ID], + # record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + # record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + # record[HealthMonitorRecordFields::DETAILS]["state"], + # provider.get_labels(record), + # provider.get_config(monitor_id), + # record[HealthMonitorRecordFields::DETAILS] + # ) + + # state.update_state(health_monitor_record, + # provider.get_config(health_monitor_record.monitor_id) + # ) + + # # get the health state based on the monitor's operational state + # # update state calls updates the state of the monitor based on configuration and history of the the monitor records + # health_monitor_record.state = state.get_state(monitor_instance_id).new_state + # health_monitor_records.push(health_monitor_record) + # #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # end + + # #handle kube api down + # kube_api_down_handler = HealthKubeApiDownHandler.new + # health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) + + # # Dedupe daemonset signals + # # Remove unit monitor signals for “gone” objects + # reducer = HealthSignalReducer.new() + # reduced_records = reducer.reduce_signals(health_monitor_records, resources) + + # cluster_id = 'fake_cluster_id' + + # #get the list of 'none' and 'unknown' signals + # missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) + # #update state for missing signals + # missing_signals.each{|signal| + # state.update_state(signal, + # provider.get_config(signal.monitor_id) + # ) + # } + # generator.update_last_received_records(reduced_records) + # reduced_records.push(*missing_signals) + + # # build the health model + # all_records = reduced_records + # model_builder.process_records(all_records) + # all_monitors = model_builder.finalize_model + + # # update the state for aggregate monitors (unit monitors are updated above) + # all_monitors.each{|monitor_instance_id, monitor| + # if monitor.is_aggregate_monitor + # state.update_state(monitor, + # provider.get_config(monitor.monitor_id) + # ) + # end + + # instance_state = state.get_state(monitor_instance_id) + # #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # should_send = instance_state.should_send + + # # always send cluster monitor as a heartbeat + # if !should_send && monitor_instance_id != MonitorId::CLUSTER + # all_monitors.delete(monitor_instance_id) + # end + # } + + # records_to_send = [] + # all_monitors.keys.each{|key| + # record = provider.get_record(all_monitors[key], state) + # #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" + # } + # end + + def test_get_workload_name + # node_inventory = JSON.parse(File.read("C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/dilipr-health-test-nodes.json")) + # pod_inventory = JSON.parse(File.read('C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/dilipr-health-test-pods.json')) + # replicaset_inventory = JSON.parse(File.read('C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/dilipr-health-test-rs.json')) + node_inventory = JSON.parse(File.read("C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/jobyaks2-nodes.json")) + pod_inventory = JSON.parse(File.read('C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/jobyaks2-pods.json')) + replicaset_inventory = JSON.parse(File.read('C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/jobyaks2-rs.json')) resources = HealthKubernetesResources.instance resources.node_inventory = node_inventory resources.pod_inventory = pod_inventory - resources.set_deployment_inventory(deployment_inventory) - - workload_names = resources.get_workload_names - provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) - - - #container memory cpu records - file = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/cadvisor_perf.json') - cadvisor_records = JSON.parse(file) - cadvisor_records = cadvisor_records.select{|record| record['DataItems'][0]['ObjectName'] == 'K8SContainer'} - formatted_records = [] - formatter = HealthContainerCpuMemoryRecordFormatter.new - cadvisor_records.each{|record| - formatted_record = formatter.get_record_from_cadvisor_record(record) - formatted_records.push(formatted_record) + resources.set_replicaset_inventory(replicaset_inventory) + pod_inventory['items'].each{|pod| + workload_name = resources.get_workload_name(pod) + puts "POD #{pod['metadata']['name']} Workload Name #{workload_name}" } - resources.build_pod_uid_lookup #call this in in_kube_health every min + pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(resources) - cluster_labels = { - 'container.azm.ms/cluster-region' => 'eastus', - 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', - 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', - 'container.azm.ms/cluster-name' => 'dilipr-health-test' - } - - cluster_id = 'fake_cluster_id' - - aggregator = HealthContainerCpuMemoryAggregator.new(resources, provider) - deduped_records = aggregator.dedupe_records(formatted_records) - aggregator.aggregate(deduped_records) - aggregator.compute_state - container_cpu_memory_records = aggregator.get_records - - records.concat(container_cpu_memory_records) - - health_monitor_records = [] - records.each do |record| - monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] - monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] - health_monitor_record = HealthMonitorRecord.new( - record[HealthMonitorRecordFields::MONITOR_ID], - record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], - record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], - record[HealthMonitorRecordFields::DETAILS]["state"], - provider.get_labels(record), - provider.get_config(monitor_id), - record[HealthMonitorRecordFields::DETAILS] - ) - - state.update_state(health_monitor_record, - provider.get_config(health_monitor_record.monitor_id) - ) - - # get the health state based on the monitor's operational state - # update state calls updates the state of the monitor based on configuration and history of the the monitor records - health_monitor_record.state = state.get_state(monitor_instance_id).new_state - health_monitor_records.push(health_monitor_record) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - end - - #handle kube api down - kube_api_down_handler = HealthKubeApiDownHandler.new - health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) - - # Dedupe daemonset signals - # Remove unit monitor signals for “gone” objects - reducer = HealthSignalReducer.new() - reduced_records = reducer.reduce_signals(health_monitor_records, resources) - - cluster_id = 'fake_cluster_id' - - #get the list of 'none' and 'unknown' signals - missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) - #update state for missing signals - missing_signals.each{|signal| - state.update_state(signal, - provider.get_config(signal.monitor_id) - ) - } - generator.update_last_received_records(reduced_records) - reduced_records.push(*missing_signals) - - # build the health model - all_records = reduced_records - model_builder.process_records(all_records) - all_monitors = model_builder.finalize_model - - # update the state for aggregate monitors (unit monitors are updated above) - all_monitors.each{|monitor_instance_id, monitor| - if monitor.is_aggregate_monitor - state.update_state(monitor, - provider.get_config(monitor.monitor_id) - ) - end - - instance_state = state.get_state(monitor_instance_id) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - should_send = instance_state.should_send - - # always send cluster monitor as a heartbeat - if !should_send && monitor_instance_id != MonitorId::CLUSTER - all_monitors.delete(monitor_instance_id) - end - } - - records_to_send = [] - all_monitors.keys.each{|key| - record = provider.get_record(all_monitors[key], state) - #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" - } + puts JSON.pretty_generate(pods_ready_hash) end end \ No newline at end of file From 22686b236dfc89ca69b7e37661bc1a7db34f1b94 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 1 Nov 2019 15:43:23 -0700 Subject: [PATCH 10/10] Fixed Node Condition Bug, added exception handling to return get_rs_owner_ref --- .../health/health_kubernetes_resources.rb | 27 +++++++++++-------- source/code/plugin/in_kube_health.rb | 8 +++--- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/source/code/plugin/health/health_kubernetes_resources.rb b/source/code/plugin/health/health_kubernetes_resources.rb index 954c090ac..743dd8b94 100644 --- a/source/code/plugin/health/health_kubernetes_resources.rb +++ b/source/code/plugin/health/health_kubernetes_resources.rb @@ -172,18 +172,23 @@ def get_replica_set_owner_ref(controller_name) if @workload_name_cache.key?(controller_name) return @workload_name_cache[controller_name] end - owner_ref = controller_name - @replicaset_inventory['items'].each{|rs| - rs_name = rs['metadata']['name'] - if controller_name.casecmp(rs_name) == 0 - if !rs['metadata']['ownerReferences'].nil? - owner_ref = rs['metadata']['ownerReferences'][0]['name'] if rs['metadata']['ownerReferences'][0]['name'] + begin + owner_ref = controller_name + @replicaset_inventory['items'].each{|rs| + rs_name = rs['metadata']['name'] + if controller_name.casecmp(rs_name) == 0 + if !rs['metadata']['ownerReferences'].nil? + owner_ref = rs['metadata']['ownerReferences'][0]['name'] if rs['metadata']['ownerReferences'][0]['name'] + end + break end - break - end - } - @workload_name_cache[controller_name] = owner_ref - return owner_ref + } + @workload_name_cache[controller_name] = owner_ref + return owner_ref + rescue => e + @log.info "Error in get_replica_set_owner_ref(controller_name) #{e.message}" + return controller_name + end end def get_node_capacity(node_name, type) diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index 756c4ea25..affbdd275 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -260,14 +260,14 @@ def process_node_condition_monitor(node_inventory) node_inventory['items'].each do |node| node_name = node['metadata']['name'] conditions = node['status']['conditions'] - state = HealthMonitorUtils.get_node_state_from_node_conditions(monitor_config, conditions) + node_state = HealthMonitorUtils.get_node_state_from_node_conditions(monitor_config, conditions) details = {} conditions.each do |condition| - state = !(condition['status'].downcase == 'true' && condition['type'].downcase != 'ready') ? HealthMonitorStates::PASS : HealthMonitorStates::FAIL - details[condition['type']] = {"Reason" => condition['reason'], "Message" => condition['message'], "State" => state} + condition_state = !(condition['status'].downcase == 'true' && condition['type'].downcase != 'ready') ? HealthMonitorStates::PASS : HealthMonitorStates::FAIL + details[condition['type']] = {"Reason" => condition['reason'], "Message" => condition['message'], "State" => condition_state} #@@hmlog.info "Node Condition details: #{JSON.pretty_generate(details)}" end - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => details} + health_monitor_record = {"timestamp" => timestamp, "state" => node_state, "details" => details} monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id, node_name]) health_record = {} time_now = Time.now.utc.iso8601