diff --git a/Rakefile b/Rakefile new file mode 100644 index 000000000..3733e71a3 --- /dev/null +++ b/Rakefile @@ -0,0 +1,9 @@ +require 'rake/testtask' + +task default: "test" + +Rake::TestTask.new do |task| + task.libs << "test" + task.pattern = './test/code/plugin/health/*_spec.rb' + task.warning = false +end \ No newline at end of file diff --git a/build/Makefile b/build/Makefile index b5312cfe3..257980160 100644 --- a/build/Makefile +++ b/build/Makefile @@ -91,9 +91,9 @@ CXXFLAGS = $(COMPILE_FLAGS) # Build targets ifeq ($(ULINUX),1) -all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) KIT_STATUS kit fluentbitplugin +all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) KIT_STATUS kit fluentbitplugin rubypluginstests else -all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) fluentbitplugin +all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) fluentbitplugin rubypluginstests endif clean : @@ -143,6 +143,15 @@ fluentbitplugin : make -C $(GO_SOURCE_DIR) fbplugin $(COPY) $(GO_SOURCE_DIR)/out_oms.so $(INTERMEDIATE_DIR) +rubypluginstests : + @echo "========================= Installing pre-reqs for running tests" + sudo apt-add-repository ppa:brightbox/ruby-ng -y + sudo apt-get update + sudo apt-get install ruby2.4 rake -y + sudo gem install minitest + @echo "========================= Running tests..." 
+ rake test + #-------------------------------------------------------------------------------- # PAL build # diff --git a/installer/conf/container.conf b/installer/conf/container.conf index f41bd6f98..6d810a0e2 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -17,16 +17,22 @@ #cadvisor perf - type cadvisorperf - tag oms.api.cadvisorperf - run_interval 60s + type cadvisorperf + tag oms.api.cadvisorperf + run_interval 60s log_level debug + + type filter_cadvisor_health_node + log_level debug + + + #custom_metrics_mdm filter plugin type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes log_level info @@ -61,6 +67,25 @@ max_retry_wait 9m + + + @type forward + send_timeout 60s + recover_wait 10s + hard_timeout 60s + heartbeat_type tcp + + + host healthmodel-replicaset-service.kube-system + port 25227 + + + + @type file + path /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log + + + type out_mdm log_level debug diff --git a/installer/conf/health_model_definition.json b/installer/conf/health_model_definition.json new file mode 100644 index 000000000..1112fe158 --- /dev/null +++ b/installer/conf/health_model_definition.json @@ -0,0 +1,248 @@ +[ + { + "monitor_id": "user_workload_pods_ready", + "parent_monitor_id": "user_workload", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/workload-name", + "container.azm.ms/workload-kind", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "user_workload", + "parent_monitor_id": "namespace", + "labels": [ + "container.azm.ms/namespace", + 
"container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "system_workload_pods_ready", + "parent_monitor_id": "system_workload", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/workload-name", + "container.azm.ms/workload-kind", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "system_workload", + "parent_monitor_id": "k8s_infrastructure", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "kube_api_status", + "parent_monitor_id": "k8s_infrastructure", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "namespace", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "parent_monitor_id": "all_namespaces" + }, + { + "monitor_id": "k8s_infrastructure", + "parent_monitor_id": "cluster", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "all_namespaces", + "parent_monitor_id": "all_workloads", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "all_workloads", + "parent_monitor_id": "cluster", + "labels": [ + 
"container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "node_cpu_utilization", + "parent_monitor_id": "node", + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "node_memory_utilization", + "parent_monitor_id": "node", + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "node_condition", + "parent_monitor_id": "node", + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "node", + "aggregation_algorithm": "worstOf", + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "parent_monitor_id": [ + { + "label": "kubernetes.io/role", + "operator": "==", + "value": "master", + "id": "master_node_pool" + }, + { + "label": "kubernetes.io/role", + "operator": "==", + "value": "agent", + "id": "agent_node_pool" + } + ] + }, + { + "monitor_id": "master_node_pool", + "aggregation_algorithm": "percentage", + "aggregation_algorithm_params": { + "critical_threshold": 80.0, + "warning_threshold": 90.0 + }, + "parent_monitor_id": "all_nodes", + "labels": [ + "container.azm.ms/cluster-region", + 
"container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "agent_node_pool", + "aggregation_algorithm": "percentage", + "aggregation_algorithm_params": { + "state_threshold": 80.0 + }, + "labels": [ + "agentpool", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "parent_monitor_id": "all_nodes" + }, + { + "monitor_id": "all_nodes", + "aggregation_algorithm": "worstOf", + "parent_monitor_id": "cluster", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "cluster", + "aggregation_algorithm": "worstOf", + "parent_monitor_id": null, + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "subscribed_capacity_cpu", + "parent_monitor_id": "capacity", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "subscribed_capacity_memory", + "parent_monitor_id": "capacity", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "capacity", + "parent_monitor_id": "all_workloads", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + } +] \ No newline at end of file diff --git a/installer/conf/healthmonitorconfig.json 
b/installer/conf/healthmonitorconfig.json new file mode 100644 index 000000000..28d562652 --- /dev/null +++ b/installer/conf/healthmonitorconfig.json @@ -0,0 +1,31 @@ +{ + "node_cpu_utilization": { + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }, + "node_memory_utilization": { + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }, + "container_cpu_utilization": { + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }, + "container_memory_utilization": { + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }, + "user_workload_pods_ready": { + "WarnThresholdPercentage": 0.0, + "FailThresholdPercentage": 10.0, + "ConsecutiveSamplesForStateTransition": 2 + }, + "system_workload_pods_ready": { + "FailThresholdPercentage": 0.0, + "ConsecutiveSamplesForStateTransition": 2 + } +} \ No newline at end of file diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 0dfa3710e..4b4ec09ea 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -1,4 +1,9 @@ # Fluentd config file for OMS Docker - cluster components (kubeAPI) + + type forward + port 25227 + bind 0.0.0.0 + #Kubernetes pod inventory @@ -13,7 +18,7 @@ type kubeevents tag oms.containerinsights.KubeEvents run_interval 60s - log_level debug + log_level debug #Kubernetes logs @@ -47,6 +52,14 @@ log_level debug +#Kubernetes health + + type kubehealth + tag oms.api.KubeHealth.ReplicaSet + run_interval 60s + log_level debug + + #cadvisor perf- Windows nodes type wincadvisorperf @@ -69,6 +82,9 @@ log_level info + + type filter_health_model_builder + type out_mdm log_level debug @@ -118,7 +134,7 @@ type out_oms_api log_level debug - buffer_chunk_limit 10m + buffer_chunk_limit 10m buffer_type file buffer_path 
%STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer buffer_queue_limit 10 @@ -127,6 +143,8 @@ retry_wait 30s + + type out_oms log_level debug @@ -170,7 +188,7 @@ max_retry_wait 9m - + type out_oms log_level debug num_threads 5 @@ -214,4 +232,16 @@ retry_limit 10 retry_wait 30s max_retry_wait 9m + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_KubeHealth*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s \ No newline at end of file diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 62a6f6885..3dc1a18cd 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -112,10 +112,45 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root /opt/livenessprobe.sh; installer/scripts/livenessprobe.sh; 755; root; root -/opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root -/opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root +/opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root +/opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root + + +/opt/microsoft/omsagent/plugin/filter_cadvisor_health_node.rb; source/code/plugin/filter_cadvisor_health_node.rb; 644; root; root +/opt/microsoft/omsagent/plugin/filter_health_model_builder.rb; source/code/plugin/filter_health_model_builder.rb; 644; root; root +/opt/microsoft/omsagent/plugin/in_kube_health.rb; source/code/plugin/in_kube_health.rb; 644; root; root 
+/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json; installer/conf/healthmonitorconfig.json; 644; root; root +/etc/opt/microsoft/docker-cimprov/health/health_model_definition.json; installer/conf/health_model_definition.json; 644; root; root + + +/opt/microsoft/omsagent/plugin/health/aggregate_monitor.rb; source/code/plugin/health/aggregate_monitor.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/agg_monitor_id_labels.rb; source/code/plugin/health/agg_monitor_id_labels.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/aggregate_monitor_state_finalizer.rb; source/code/plugin/health/aggregate_monitor_state_finalizer.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/cluster_health_state.rb; source/code/plugin/health/cluster_health_state.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_hierarchy_builder.rb; source/code/plugin/health/health_hierarchy_builder.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_kubernetes_resources.rb; source/code/plugin/health/health_kubernetes_resources.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_kube_api_down_handler.rb; source/code/plugin/health/health_kube_api_down_handler.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_missing_signal_generator.rb; source/code/plugin/health/health_missing_signal_generator.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_model_buffer.rb; source/code/plugin/health/health_model_buffer.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_model_builder.rb; source/code/plugin/health/health_model_builder.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_model_constants.rb; source/code/plugin/health/health_model_constants.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/parent_monitor_provider.rb; source/code/plugin/health/parent_monitor_provider.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_model_definition_parser.rb; 
source/code/plugin/health/health_model_definition_parser.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_optimizer.rb; source/code/plugin/health/health_monitor_optimizer.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_provider.rb; source/code/plugin/health/health_monitor_provider.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_record.rb; source/code/plugin/health/health_monitor_record.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_state.rb; source/code/plugin/health/health_monitor_state.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_utils.rb; source/code/plugin/health/health_monitor_utils.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_signal_reducer.rb; source/code/plugin/health/health_signal_reducer.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/monitor_factory.rb; source/code/plugin/health/monitor_factory.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/monitor_set.rb; source/code/plugin/health/monitor_set.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/unit_monitor.rb; source/code/plugin/health/unit_monitor.rb; 644; root; root + %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root @@ -129,6 +164,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft; 755; root; root; sysdir /etc/opt/microsoft/docker-cimprov; 755; root; root /etc/opt/microsoft/docker-cimprov/conf; 755; root; root 
+/etc/opt/microsoft/docker-cimprov/health; 755; root; root /etc/opt/omi; 755; root; root; sysdir /etc/opt/omi/conf; 755; root; root; sysdir @@ -142,6 +178,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent; 755; root; root; sysdir /opt/microsoft/omsagent/plugin; 755; root; root; sysdir +/opt/microsoft/omsagent/plugin/health; 755; root; root; sysdir /opt/omi; 755; root; root; sysdir /opt/omi/lib; 755; root; root; sysdir @@ -205,12 +242,24 @@ touch /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log +touch /var/opt/microsoft/docker-cimprov/log/health_monitors.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/health_monitors.log +chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/health_monitors.log + +touch /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log +chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log + +touch /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log +chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log + mv /etc/opt/microsoft/docker-cimprov/container.conf /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf chown omsagent:omsagent /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf %Postuninstall_10 # If we're an upgrade, skip all of this cleanup -if ${{PERFORMING_UPGRADE_NOT}}; then +if ${{PERFORMING_UPGRADE_NOT}}; then # Clean up installinfo.txt file (registered as "conf" file to pass rpmcheck) rm -f /etc/opt/microsoft/docker-cimprov/conf/installinfo.txt* rm -f /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt diff --git a/installer/scripts/tomlparser.rb 
b/installer/scripts/tomlparser.rb index c72e64127..067586629 100644 --- a/installer/scripts/tomlparser.rb +++ b/installer/scripts/tomlparser.rb @@ -1,8 +1,10 @@ #!/usr/local/bin/ruby require_relative "tomlrb" +require 'json' -@configMapMountPath = "/etc/config/settings/log-data-collection-settings" +@log_settings_config_map_mount_path = "/etc/config/settings/log-data-collection-settings" +@agent_settings_config_map_mount_path = "/etc/config/settings/agent-settings" @configVersion = "" @configSchemaVersion = "" # Setting default values which will be used in case they are not set in the configmap or if configmap doesnt exist @@ -16,16 +18,16 @@ @excludePath = "*.csv2" #some invalid path # Use parser to parse the configmap toml file to a ruby structure -def parseConfigMap +def parseConfigMap(path) begin # Check to see if config map is created - if (File.file?(@configMapMountPath)) - puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values" - parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) - puts "config::Successfully parsed mounted config map" + if (File.file?(path)) + puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values from #{path}" + parsedConfig = Tomlrb.load_file(path, symbolize_keys: true) + puts "config::Successfully parsed mounted config map from #{path}" return parsedConfig else - puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults" + puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for #{path}" @excludePath = "*_kube-system_*.log" return nil end @@ -117,19 +119,35 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::error::Exception while reading config settings for cluster level environment variable collection - #{errorStr}, using defaults" end end + + begin + if !parsedConfig.nil? && !parsedConfig[:agent_settings][:health_model].nil? 
&& !parsedConfig[:agent_settings][:health_model][:enabled].nil? + @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] + puts "enable_health_model = #{@enable_health_model}" + end + rescue => errorStr + puts "config::error::Exception while reading config settings for health_model enabled setting - #{errorStr}, using defaults" + @enable_health_model = false + end end @configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] puts "****************Start Config Processing********************" if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = parseConfigMap + configMapSettings = {} + + #iterate over every *settings file and build a hash of settings + Dir["/etc/config/settings/*settings"].each{|file| + puts "Parsing File #{file}" + settings = parseConfigMap(file) + configMapSettings = configMapSettings.merge(settings) if !settings.nil? + } + if !configMapSettings.nil? 
populateSettingValuesFromConfigMap(configMapSettings) end else - if (File.file?(@configMapMountPath)) puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults" - end @excludePath = "*_kube-system_*.log" end @@ -155,6 +173,8 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_STDERR_EXCLUDED_NAMESPACES=#{@stderrExcludeNamespaces}\n") file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n") file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") + #health_model settings + file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") # Close file after writing all environment variables file.close puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 61cbaea00..48b25bf14 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -30,13 +30,13 @@ def initialize end class << self - def getKubeResourceInfo(resource) + def getKubeResourceInfo(resource, api_version: nil) headers = {} response = nil - @Log.info "Getting Kube resource" + @Log.info "Getting Kube resource api_version #{api_version}" @Log.info resource begin - resourceUri = getResourceUri(resource) + resourceUri = getResourceUri(resource, api_version: api_version) if !resourceUri.nil? uri = URI.parse(resourceUri) http = Net::HTTP.new(uri.host, uri.port) @@ -76,10 +76,23 @@ def getTokenStr end end - def getResourceUri(resource) + def getClusterRegion + if ENV["AKS_REGION"] + return ENV["AKS_REGION"] + else + @Log.warn ("Kubernetes environment variable not set AKS_REGION. 
Unable to get cluster region.") + return nil + end + end + + def getResourceUri(resource, api_version: nil) begin if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"] - return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + resource + if !api_version.nil? + return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/apis/" + api_version + "/" + resource + end + api_version = @@ApiVersion + return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + api_version + "/" + resource else @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri") return nil @@ -125,6 +138,8 @@ def getClusterId return @@ClusterId if !@@ClusterId.nil? #By default initialize ClusterId to ClusterName. # In ACS/On-prem, we need to figure out how we can generate ClusterId + # Dilipr: Spoof the subid by generating md5 hash of cluster name, and taking some constant parts of it. + # e.g. md5 digest is 128 bits = 32 character in hex. 
Get first 16 and get a guid, and the next 16 to get resource id @@ClusterId = getClusterName begin cluster = ENV["AKS_RESOURCE_ID"] diff --git a/source/code/plugin/filter_cadvisor_health_container.rb b/source/code/plugin/filter_cadvisor_health_container.rb new file mode 100644 index 000000000..4090092a9 --- /dev/null +++ b/source/code/plugin/filter_cadvisor_health_container.rb @@ -0,0 +1,263 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +module Fluent + require 'logger' + require 'json' + require_relative 'oms_common' + require_relative 'HealthMonitorUtils' + require_relative 'HealthMonitorState' + require_relative "ApplicationInsightsUtility" + + + class CAdvisor2ContainerHealthFilter < Filter + Fluent::Plugin.register_filter('filter_cadvisor_health_container', self) + + config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/health_monitors.log' + config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryRssBytes' + config_param :container_resource_refresh_interval_minutes, :integer, :default => 5 + + @@object_name_k8s_node = 'K8SNode' + @@object_name_k8s_container = 'K8SContainer' + + @@counter_name_cpu = 'cpuusagenanocores' + @@counter_name_memory_rss = 'memoryrssbytes' + + @@health_monitor_config = {} + + @@hostName = (OMS::Common.get_hostname) + @@clusterName = KubernetesApiClient.getClusterName + @@clusterId = KubernetesApiClient.getClusterId + @@clusterRegion = KubernetesApiClient.getClusterRegion + @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled + + def initialize + super + @cpu_capacity = 0.0 + @memory_capacity = 0.0 + @last_resource_refresh = DateTime.now.to_time.to_i + @metrics_to_collect_hash = {} + end + + def configure(conf) + super + @log = HealthMonitorUtils.getLogHandle + @log.debug {'Starting filter_cadvisor2health plugin'} + end + + def start + super + @metrics_to_collect_hash = HealthMonitorUtils.build_metrics_hash(@metrics_to_collect) + 
@log.debug "Calling ensure_cpu_memory_capacity_set cpu_capacity #{@cpu_capacity} memory_capacity #{@memory_capacity}" + node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, @memory_capacity, @@hostName) + @cpu_capacity = node_capacity[0] + @memory_capacity = node_capacity[1] + @log.info "CPU Capacity #{@cpu_capacity} Memory Capacity #{@memory_capacity}" + #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @@hostName) + @@health_monitor_config = HealthMonitorUtils.getHealthMonitorConfig + ApplicationInsightsUtility.sendCustomEvent("filter_cadvisor_health Plugin Start", {}) + end + + def filter_stream(tag, es) + if !@@cluster_health_model_enabled + @log.info "Cluster Health Model disabled in filter_cadvisor_health_container" + return [] + end + new_es = MultiEventStream.new + #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @hostName) + records_count = 0 + es.each { |time, record| + begin + filtered_record = filter(tag, time, record) + if !filtered_record.nil? 
+ new_es.add(time, filtered_record) + records_count += 1 + end + rescue => e + router.emit_error_event(tag, time, record, e) + end + } + @log.debug "Filter Records Count #{records_count}" + new_es + end + + def filter(tag, time, record) + begin + if record.key?("MonitorLabels") + return record + end + object_name = record['DataItems'][0]['ObjectName'] + counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase + if @metrics_to_collect_hash.key?(counter_name.downcase) + metric_value = record['DataItems'][0]['Collections'][0]['Value'] + case object_name + when @@object_name_k8s_container + case counter_name.downcase + when @@counter_name_cpu + # @log.debug "Object Name #{object_name}" + # @log.debug "Counter Name #{counter_name}" + # @log.debug "Metric Value #{metric_value}" + #return process_container_cpu_record(record, metric_value) + when @@counter_name_memory_rss + #return process_container_memory_record(record, metric_value) + end + when @@object_name_k8s_node + case counter_name.downcase + when @@counter_name_cpu + #process_node_cpu_record(record, metric_value) + when @@counter_name_memory_rss + #process_node_memory_record(record, metric_value) + end + end + end + rescue => e + @log.debug "Error in filter #{e}" + @log.debug "record #{record}" + @log.debug "backtrace #{e.backtrace}" + ApplicationInsightsUtility.sendExceptionTelemetry(e) + return nil + end + end + + def process_container_cpu_record(record, metric_value) + monitor_id = HealthMonitorConstants::WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID + @log.debug "processing container cpu record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) + container_metadata = HealthMonitorUtils.getContainerMetadata(key) + if !container_metadata.nil? + cpu_limit = container_metadata['cpuLimit'] + end + + if cpu_limit.to_s.empty? 
+ #@log.info "CPU Limit is nil" + cpu_limit = @cpu_capacity + end + + #@log.info "cpu limit #{cpu_limit}" + + percent = (metric_value.to_f/cpu_limit*100).round(2) + #@log.debug "Container #{key} | Percentage of CPU limit: #{percent}" + state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID]) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} + #@log.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName, key]) + #@log.info "Monitor Instance Id: #{monitor_instance_id}" + temp = record.nil? ? "Nil" : record["MonitorInstanceId"] + @log.info "Processed Container CPU #{temp}" + return record + end + return nil + end + + def process_container_memory_record(record, metric_value) + monitor_id = HealthMonitorConstants::WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID + #@log.debug "processing container memory record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) + container_metadata = HealthMonitorUtils.getContainerMetadata(key) + if !container_metadata.nil? + memory_limit = container_metadata['memoryLimit'] + end + + if memory_limit.to_s.empty? 
+ #@log.info "Memory Limit is nil" + memory_limit = @memory_capacity + end + + #@log.info "memory limit #{memory_limit}" + + percent = (metric_value.to_f/memory_limit*100).round(2) + #@log.debug "Container #{key} | Percentage of Memory limit: #{percent}" + state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID]) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} + #@log.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName, key]) + #@log.info "Monitor Instance Id: #{monitor_instance_id}" + temp = record.nil? ? "Nil" : record["MonitorInstanceId"] + @log.info "Processed Container Memory #{temp}" + return record + end + return nil + end + + def process_node_cpu_record(record, metric_value) + monitor_id = HealthMonitorConstants::NODE_CPU_MONITOR_ID + #@log.debug "processing node cpu record" + if record.nil? 
+ return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + #@log.info "CPU capacity #{@cpu_capacity}" + + percent = (metric_value.to_f/@cpu_capacity*100).round(2) + #@log.debug "Percentage of CPU limit: #{percent}" + state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::NODE_CPU_MONITOR_ID]) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName]) + # record = HealthMonitorSignalReducer.reduceSignal(@log, monitor_id, monitor_instance_id, @@health_monitor_config[monitor_id], node_name: @@hostName) + # temp = record.nil? ? "Nil" : record["MonitorInstanceId"] + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName + @log.info "Processed Node CPU" + return health_record + end + return nil + end + + def process_node_memory_record(record, metric_value) + monitor_id = HealthMonitorConstants::NODE_MEMORY_MONITOR_ID + #@log.debug "processing node memory record" + if record.nil? 
+ return nil
+ else
+ instance_name = record['DataItems'][0]['InstanceName']
+ #@log.info "Memory capacity #{@memory_capacity}"
+
+ percent = (metric_value.to_f/@memory_capacity*100).round(2)
+ #@log.debug "Percentage of Memory limit: #{percent}"
+ state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::NODE_MEMORY_MONITOR_ID])
+ #@log.debug "Computed State : #{state}"
+ timestamp = record['DataItems'][0]['Timestamp']
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}}
+ #@log.info health_monitor_record
+
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName])
+ #@log.info "Monitor Instance Id: #{monitor_instance_id}"
+ # temp = record.nil? ? "Nil" : record["MonitorInstanceId"]
+ health_record = {}
+ time_now = Time.now.utc.iso8601
+ health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id
+ health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
+ health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record
+ health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now
+ health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now
+ health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName
+ @log.info "Processed Node Memory"
+ return health_record
+ end
+ return nil
+ end
+ end
+end
diff --git a/source/code/plugin/filter_cadvisor_health_node.rb b/source/code/plugin/filter_cadvisor_health_node.rb
new file mode 100644
index 000000000..627a525e7
--- /dev/null
+++ b/source/code/plugin/filter_cadvisor_health_node.rb
@@ -0,0 +1,267 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+module Fluent
+ require 'logger'
+ require 'json'
+ require_relative 'oms_common'
+ require_relative "ApplicationInsightsUtility"
+ require_relative 
"KubernetesApiClient" + Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } + + class CAdvisor2NodeHealthFilter < Filter + include HealthModel + Fluent::Plugin.register_filter('filter_cadvisor_health_node', self) + + attr_accessor :provider, :resources + + config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryRssBytes' + config_param :container_resource_refresh_interval_minutes, :integer, :default => 5 + config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json' + + @@object_name_k8s_node = 'K8SNode' + @@object_name_k8s_container = 'K8SContainer' + + @@counter_name_cpu = 'cpuusagenanocores' + @@counter_name_memory_rss = 'memoryrssbytes' + + @@hm_log = HealthMonitorUtils.get_log_handle + @@hostName = (OMS::Common.get_hostname) + @@clusterName = KubernetesApiClient.getClusterName + @@clusterId = KubernetesApiClient.getClusterId + @@clusterRegion = KubernetesApiClient.getClusterRegion + @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled + + def initialize + begin + super + @cpu_capacity = 0.0 + @memory_capacity = 0.0 + @last_resource_refresh = DateTime.now.to_time.to_i + @metrics_to_collect_hash = {} + @resources = HealthKubernetesResources.instance # this doesnt require node and pod inventory. 
So no need to populate them + @provider = HealthMonitorProvider.new(@@clusterId, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end + end + + def configure(conf) + super + @log = HealthMonitorUtils.get_log_handle + @log.debug {'Starting filter_cadvisor2health plugin'} + end + + def start + super + @metrics_to_collect_hash = HealthMonitorUtils.build_metrics_hash(@metrics_to_collect) + @log.debug "Calling ensure_cpu_memory_capacity_set cpu_capacity #{@cpu_capacity} memory_capacity #{@memory_capacity}" + node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, @memory_capacity, @@hostName) + @cpu_capacity = node_capacity[0] + @memory_capacity = node_capacity[1] + @log.info "CPU Capacity #{@cpu_capacity} Memory Capacity #{@memory_capacity}" + #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @@hostName) + ApplicationInsightsUtility.sendCustomEvent("filter_cadvisor_health Plugin Start", {}) + end + + def filter_stream(tag, es) + if !@@cluster_health_model_enabled + @log.info "Cluster Health Model disabled in filter_cadvisor_health_node" + return [] + end + new_es = MultiEventStream.new + #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @hostName) + records_count = 0 + es.each { |time, record| + begin + filtered_record = filter(tag, time, record) + if !filtered_record.nil? 
+ new_es.add(time, filtered_record) + records_count += 1 + end + rescue => e + @log.info "Error in filter_stream for filter_cadvisor_health_node #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end + } + @log.debug "Filter Records Count #{records_count}" + new_es + end + + def filter(tag, time, record) + begin + if record.key?("MonitorLabels") + return record + end + object_name = record['DataItems'][0]['ObjectName'] + counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase + if @metrics_to_collect_hash.key?(counter_name.downcase) + metric_value = record['DataItems'][0]['Collections'][0]['Value'] + case object_name + when @@object_name_k8s_container + case counter_name.downcase + when @@counter_name_cpu + # @log.debug "Object Name #{object_name}" + # @log.debug "Counter Name #{counter_name}" + # @log.debug "Metric Value #{metric_value}" + #return process_container_cpu_record(record, metric_value) + when @@counter_name_memory_rss + #return process_container_memory_record(record, metric_value) + end + when @@object_name_k8s_node + case counter_name.downcase + when @@counter_name_cpu + process_node_cpu_record(record, metric_value) + when @@counter_name_memory_rss + process_node_memory_record(record, metric_value) + end + end + end + rescue => e + @log.debug "Error in filter #{e}" + @log.debug "record #{record}" + @log.debug "backtrace #{e.backtrace}" + ApplicationInsightsUtility.sendExceptionTelemetry(e) + return nil + end + end + + def process_container_cpu_record(record, metric_value) + monitor_id = HealthMonitorConstants::CONTAINER_CPU_MONITOR_ID + @log.debug "processing container cpu record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) + container_metadata = HealthMonitorUtils.getContainerMetadata(key) + if !container_metadata.nil? 
+ cpu_limit = container_metadata['cpuLimit'] + end + + if cpu_limit.to_s.empty? + #@log.info "CPU Limit is nil" + cpu_limit = @cpu_capacity + end + + #@log.info "cpu limit #{cpu_limit}" + + percent = (metric_value.to_f/cpu_limit*100).round(2) + #@log.debug "Container #{key} | Percentage of CPU limit: #{percent}" + state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(monitor_id)) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} + #@log.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName, key]) + #@log.info "Monitor Instance Id: #{monitor_instance_id}" + temp = record.nil? ? "Nil" : record["MonitorInstanceId"] + @log.info "Processed Container CPU #{temp}" + return record + end + return nil + end + + def process_container_memory_record(record, metric_value) + monitor_id = HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID + #@log.debug "processing container memory record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) + container_metadata = HealthMonitorUtils.getContainerMetadata(key) + if !container_metadata.nil? + memory_limit = container_metadata['memoryLimit'] + end + + if memory_limit.to_s.empty? 
+ #@log.info "Memory Limit is nil" + memory_limit = @memory_capacity + end + + #@log.info "memory limit #{memory_limit}" + + percent = (metric_value.to_f/memory_limit*100).round(2) + #@log.debug "Container #{key} | Percentage of Memory limit: #{percent}" + state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID)) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} + #@log.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName, key]) + #@log.info "Monitor Instance Id: #{monitor_instance_id}" + temp = record.nil? ? "Nil" : record["MonitorInstanceId"] + @log.info "Processed Container Memory #{temp}" + return record + end + return nil + end + + def process_node_cpu_record(record, metric_value) + monitor_id = HealthMonitorConstants::NODE_CPU_MONITOR_ID + #@log.debug "processing node cpu record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + #@log.info "CPU capacity #{@cpu_capacity}" + + percent = (metric_value.to_f/@cpu_capacity*100).round(2) + #@log.debug "Percentage of CPU limit: #{percent}" + state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::NODE_CPU_MONITOR_ID)) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName]) + # temp = record.nil? ? 
"Nil" : record["MonitorInstanceId"] + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName + @log.info "Processed Node CPU" + return health_record + end + return nil + end + + def process_node_memory_record(record, metric_value) + monitor_id = HealthMonitorConstants::NODE_MEMORY_MONITOR_ID + #@log.debug "processing node memory record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + #@log.info "Memory capacity #{@memory_capacity}" + + percent = (metric_value.to_f/@memory_capacity*100).round(2) + #@log.debug "Percentage of Memory limit: #{percent}" + state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::NODE_MEMORY_MONITOR_ID)) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} + #@log.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName]) + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + 
health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName + @log.info "Processed Node Memory" + return health_record + end + return nil + end + end +end diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb new file mode 100644 index 000000000..0c1b378a0 --- /dev/null +++ b/source/code/plugin/filter_health_model_builder.rb @@ -0,0 +1,233 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. + +# frozen_string_literal: true + +module Fluent + require 'logger' + require 'json' + Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } + + + class FilterHealthModelBuilder < Filter + Fluent::Plugin.register_filter('filter_health_model_builder', self) + + config_param :enable_log, :integer, :default => 0 + config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log' + config_param :model_definition_path, :default => '/etc/opt/microsoft/docker-cimprov/health/health_model_definition.json' + config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json' + config_param :health_state_serialized_path, :default => '/mnt/azure/health_model_state.json' + attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator + include HealthModel + + @@rewrite_tag = 'oms.api.KubeHealth.AgentCollectionTime' + @@cluster_id = KubernetesApiClient.getClusterId + @@token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" + @@cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled + + def initialize + begin + super + @buffer = 
HealthModel::HealthModelBuffer.new + @cluster_health_state = ClusterHealthState.new(@@token_file_path, @@cert_file_path) + @health_model_definition = HealthModel::ParentMonitorProvider.new(HealthModel::HealthModelDefinitionParser.new(@model_definition_path).parse_file) + @monitor_factory = HealthModel::MonitorFactory.new + @hierarchy_builder = HealthHierarchyBuilder.new(@health_model_definition, @monitor_factory) + # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + @state_finalizers = [HealthModel::AggregateMonitorStateFinalizer.new] + @monitor_set = HealthModel::MonitorSet.new + @model_builder = HealthModel::HealthModelBuilder.new(@hierarchy_builder, @state_finalizers, @monitor_set) + @kube_api_down_handler = HealthKubeApiDownHandler.new + @resources = HealthKubernetesResources.instance + @reducer = HealthSignalReducer.new + @state = HealthMonitorState.new + @generator = HealthMissingSignalGenerator.new + #TODO: cluster_labels needs to be initialized + @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) + deserialized_state_info = @cluster_health_state.get_state + @state = HealthMonitorState.new + @state.initialize_state(deserialized_state_info) + @cluster_old_state = 'none' + @cluster_new_state = 'none' + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end + end + + def configure(conf) + begin + super + @log = nil + if @enable_log + @log = Logger.new(@log_path, 'weekly') + @log.info 'Starting filter_health_model_builder plugin' + end + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end + end + + def start + super + end + + def shutdown + super + end + + def filter_stream(tag, es) + begin + if !@@cluster_health_model_enabled + @log.info "Cluster 
Health Model disabled in filter_health_model_builder" + return [] + end + new_es = MultiEventStream.new + time = Time.now + + if tag.start_with?("oms.api.KubeHealth.DaemonSet") + records = [] + if !es.nil? + es.each{|time, record| + records.push(record) + } + @buffer.add_to_buffer(records) + end + return [] + elsif tag.start_with?("oms.api.KubeHealth.ReplicaSet") + @log.info "TAG #{tag}" + records = [] + es.each{|time, record| + records.push(record) + } + @buffer.add_to_buffer(records) + records_to_process = @buffer.get_buffer + @buffer.reset_buffer + + health_monitor_records = [] + records_to_process.each do |record| + monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + #HealthMonitorRecord + health_monitor_record = HealthMonitorRecord.new( + record[HealthMonitorRecordFields::MONITOR_ID], + record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + record[HealthMonitorRecordFields::DETAILS]["state"], + @provider.get_labels(record), + @provider.get_config(monitor_id), + record[HealthMonitorRecordFields::DETAILS] + ) + + health_monitor_records.push(health_monitor_record) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + end + + @log.info "health_monitor_records.size #{health_monitor_records.size}" + # Dedupe daemonset signals + # Remove unit monitor signals for “gone” objects + # update state for the reduced set of signals + reduced_records = @reducer.reduce_signals(health_monitor_records, @resources) + reduced_records.each{|record| + @state.update_state(record, + @provider.get_config(record.monitor_id) + ) + # get the health state based on the monitor's operational state + # update state calls updates the state of the monitor based on configuration and history of the the monitor records + record.state = 
@state.get_state(record.monitor_instance_id).new_state + } + @log.info "after deduping and removing gone objects reduced_records.size #{reduced_records.size}" + + reduced_records = @kube_api_down_handler.handle_kube_api_down(reduced_records) + @log.info "after kube api down handler health_monitor_records.size #{health_monitor_records.size}" + + #get the list of 'none' and 'unknown' signals + missing_signals = @generator.get_missing_signals(@@cluster_id, reduced_records, @resources, @provider) + + @log.info "after getting missing signals missing_signals.size #{missing_signals.size}" + #update state for missing signals + missing_signals.each{|signal| + + @state.update_state(signal, @provider.get_config(signal.monitor_id)) + @log.info "After Updating #{@state.get_state(signal.monitor_instance_id)} #{@state.get_state(signal.monitor_instance_id).new_state}" + # for unknown/none records, update the "monitor state" to be the latest state (new_state) of the monitor instance from the state + signal.state = @state.get_state(signal.monitor_instance_id).new_state + } + + @generator.update_last_received_records(reduced_records) + all_records = reduced_records.clone + all_records.push(*missing_signals) + + @log.info "after Adding missing signals all_records.size #{all_records.size}" + + # build the health model + @model_builder.process_records(all_records) + all_monitors = @model_builder.finalize_model + + @log.info "after building health_model #{all_monitors.size}" + + # update the state for aggregate monitors (unit monitors are updated above) + all_monitors.each{|monitor_instance_id, monitor| + if monitor.is_aggregate_monitor + @state.update_state(monitor, + @provider.get_config(monitor.monitor_id) + ) + end + + instance_state = @state.get_state(monitor_instance_id) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + should_send = instance_state.should_send + + # always send cluster monitor as a heartbeat + 
if !should_send && monitor_instance_id != MonitorId::CLUSTER
+ all_monitors.delete(monitor_instance_id)
+ end
+ }
+
+ @log.info "after optimizing health signals all_monitors.size #{all_monitors.size}"
+
+ # for each key in monitor.keys,
+ # get the state from health_monitor_state
+ # generate the record to send
+ all_monitors.keys.each{|key|
+ record = @provider.get_record(all_monitors[key], state)
+ if record[HealthMonitorRecordFields::MONITOR_ID] == MonitorId::CLUSTER && all_monitors.size > 1
+ old_state = record[HealthMonitorRecordFields::OLD_STATE]
+ new_state = record[HealthMonitorRecordFields::NEW_STATE]
+ if old_state != new_state && @cluster_old_state != old_state && @cluster_new_state != new_state
+ ApplicationInsightsUtility.sendCustomEvent("HealthModel_ClusterStateChanged",{"old_state" => old_state , "new_state" => new_state, "monitor_count" => all_monitors.size})
+ @log.info "sent telemetry for cluster state change from #{record['OldState']} to #{record['NewState']}"
+ @cluster_old_state = old_state
+ @cluster_new_state = new_state
+ end
+ end
+ #@log.info "#{record["Details"]} #{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}"
+ new_es.add(time, record)
+ }
+
+ #emit the stream
+ router.emit_stream(@@rewrite_tag, new_es)
+
+ #initialize monitor_set and model_builder
+ @monitor_set = HealthModel::MonitorSet.new
+ @model_builder = HealthModel::HealthModelBuilder.new(@hierarchy_builder, @state_finalizers, @monitor_set)
+
+ #update cluster state custom resource
+ @cluster_health_state.update_state(@state.to_h)
+
+ # return an empty event stream, else the match will throw a NoMethodError
+ return []
+ elsif tag.start_with?("oms.api.KubeHealth.AgentCollectionTime")
+ # this filter also acts as a pass through as we are rewriting the tag and emitting to the fluent stream
+ es
+ else
+ raise "Invalid tag #{tag} received"
+ end
+
+ rescue => e
+ ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"})
+ @log.warn 
"Message: #{e.message} Backtrace: #{e.backtrace}" + return nil + end + end + end +end diff --git a/source/code/plugin/health/agg_monitor_id_labels.rb b/source/code/plugin/health/agg_monitor_id_labels.rb new file mode 100644 index 000000000..48ca46184 --- /dev/null +++ b/source/code/plugin/health/agg_monitor_id_labels.rb @@ -0,0 +1,26 @@ +module HealthModel + class AggregateMonitorInstanceIdLabels + @@id_labels_mapping = { + MonitorId::SYSTEM_WORKLOAD => [HealthMonitorLabels::NAMESPACE, HealthMonitorLabels::WORKLOAD_NAME], + MonitorId::USER_WORKLOAD => [HealthMonitorLabels::NAMESPACE, HealthMonitorLabels::WORKLOAD_NAME], + MonitorId::NODE => [HealthMonitorLabels::AGENTPOOL, HealthMonitorLabels::ROLE, HealthMonitorLabels::HOSTNAME], + MonitorId::NAMESPACE => [HealthMonitorLabels::NAMESPACE], + MonitorId::AGENT_NODE_POOL => [HealthMonitorLabels::AGENTPOOL], + # MonitorId::ALL_AGENT_NODE_POOLS => [], + # MonitorId::ALL_NODE_POOLS => [], + # MonitorId::ALL_NODES => [], + # MonitorId::K8S_INFRASTRUCTURE => [], + # MonitorId::CLUSTER => [], + # MonitorId::WORKLOAD => [] + } + + def self.get_labels_for(monitor_id) + if @@id_labels_mapping.key?(monitor_id) + return @@id_labels_mapping[monitor_id] + else + return [] + end + + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/aggregate_monitor.rb b/source/code/plugin/health/aggregate_monitor.rb new file mode 100644 index 000000000..794f716ce --- /dev/null +++ b/source/code/plugin/health/aggregate_monitor.rb @@ -0,0 +1,193 @@ +# frozen_string_literal: true + +require_relative 'health_model_constants' +require 'json' + +module HealthModel + class AggregateMonitor + attr_accessor :monitor_id, :monitor_instance_id, :state, :transition_date_time, :aggregation_algorithm, :aggregation_algorithm_params, :labels, :is_aggregate_monitor, :details + attr_reader :member_monitors, :member_state_counts + + @@sort_key_order = { + MonitorState::UNKNOWN => 1, + MonitorState::CRITICAL => 2, + 
MonitorState::WARNING => 3, + MonitorState::HEALTHY => 4, + MonitorState::NONE => 5 + } + + # constructor + def initialize( + monitor_id, + monitor_instance_id, + state, + transition_date_time, + aggregation_algorithm, + aggregation_algorithm_params, + labels + ) + @monitor_id = monitor_id + @monitor_instance_id = monitor_instance_id + @state = state + @transition_date_time = transition_date_time + @aggregation_algorithm = aggregation_algorithm || AggregationAlgorithm::WORSTOF + @aggregation_algorithm_params = aggregation_algorithm_params + @labels = labels + @member_monitors = {} + @member_state_counts = {} + @is_aggregate_monitor = true + end + + # adds a member monitor as a child + def add_member_monitor(member_monitor_instance_id) + unless @member_monitors.key?(member_monitor_instance_id) + @member_monitors[member_monitor_instance_id] = true + end + end + + #removes a member monitor + def remove_member_monitor(member_monitor_instance_id) + if @member_monitors.key?(member_monitor_instance_id) + @member_monitors.delete(member_monitor_instance_id) + end + end + + # return the member monitors as an array + def get_member_monitors + @member_monitors.map(&:first) + end + + # calculates the state of the aggregate monitor based on aggregation algorithm and child monitor states + def calculate_state(monitor_set) + case @aggregation_algorithm + when AggregationAlgorithm::WORSTOF + @state = calculate_worst_of_state(monitor_set) + when AggregationAlgorithm::PERCENTAGE + @state = calculate_percentage_state(monitor_set) + else + raise 'No aggregation algorithm specified' + end + end + + def calculate_details(monitor_set) + @details = {} + @details['details'] = {} + @details['state'] = state + @details['timestamp'] = transition_date_time + ids = [] + member_monitor_instance_ids = get_member_monitors + member_monitor_instance_ids.each{|member_monitor_id| + member_monitor = monitor_set.get_monitor(member_monitor_id) + member_state = member_monitor.state + if 
@details['details'].key?(member_state) + ids = @details['details'][member_state] + if !ids.include?(member_monitor.monitor_instance_id) + ids.push(member_monitor.monitor_instance_id) + end + @details['details'][member_state] = ids + else + @details['details'][member_state] = [member_monitor.monitor_instance_id] + end + } + end + + # calculates the worst of state, given the member monitors + def calculate_worst_of_state(monitor_set) + + @member_state_counts = map_member_monitor_states(monitor_set) + + if member_state_counts.length === 0 + return MonitorState::NONE + end + + if member_state_counts.key?(MonitorState::CRITICAL) && member_state_counts[MonitorState::CRITICAL] > 0 + return MonitorState::CRITICAL + end + if member_state_counts.key?(MonitorState::ERROR) && member_state_counts[MonitorState::ERROR] > 0 + return MonitorState::ERROR + end + if member_state_counts.key?(MonitorState::WARNING) && member_state_counts[MonitorState::WARNING] > 0 + return MonitorState::WARNING + end + + if member_state_counts.key?(MonitorState::UNKNOWN) && member_state_counts[MonitorState::UNKNOWN] > 0 + return MonitorState::UNKNOWN + end + + if member_state_counts.key?(MonitorState::HEALTHY) && member_state_counts[MonitorState::HEALTHY] > 0 + return MonitorState::HEALTHY #healthy should win over none in aggregation + end + + return MonitorState::NONE + + end + + # calculates a percentage state, given the aggregation algorithm parameters + def calculate_percentage_state(monitor_set) + + #sort + #TODO: What if sorted_filtered is empty? is that even possible? 
+ sorted_filtered = sort_filter_member_monitors(monitor_set) + + state_threshold = @aggregation_algorithm_params['state_threshold'].to_f + + size = sorted_filtered.size + if size == 1 + @state = sorted_filtered[0].state + else + count = ((state_threshold*size)/100).ceil + index = size - count + @state = sorted_filtered[index].state + end + end + + # maps states of member monitors to counts + def map_member_monitor_states(monitor_set) + member_monitor_instance_ids = get_member_monitors + if member_monitor_instance_ids.nil? || member_monitor_instance_ids.size == 0 + return {} + end + + state_counts = {} + + member_monitor_instance_ids.each {|monitor_instance_id| + + member_monitor = monitor_set.get_monitor(monitor_instance_id) + monitor_state = member_monitor.state + + if !state_counts.key?(monitor_state) + state_counts[monitor_state] = 1 + else + count = state_counts[monitor_state] + state_counts[monitor_state] = count+1 + end + } + + return state_counts; + end + + # Sort the member monitors in the following order +=begin + 1. Error + 2. Unknown + 3. Critical + 4. Warning + 5. 
Healthy + Remove 'none' state monitors +=end + def sort_filter_member_monitors(monitor_set) + member_monitor_instance_ids = get_member_monitors + member_monitors = [] + + member_monitor_instance_ids.each {|monitor_instance_id| + member_monitor = monitor_set.get_monitor(monitor_instance_id) + member_monitors.push(member_monitor) + } + + filtered = member_monitors.select{|monitor| monitor.state != MonitorState::NONE} + sorted = filtered.sort_by{ |monitor| [@@sort_key_order[monitor.state]] } + + return sorted + end + end +end diff --git a/source/code/plugin/health/aggregate_monitor_state_finalizer.rb b/source/code/plugin/health/aggregate_monitor_state_finalizer.rb new file mode 100644 index 000000000..74e780924 --- /dev/null +++ b/source/code/plugin/health/aggregate_monitor_state_finalizer.rb @@ -0,0 +1,33 @@ +module HealthModel + class AggregateMonitorStateFinalizer + + def finalize(monitor_set) + top_level_monitor = monitor_set.get_monitor(MonitorId::CLUSTER) + if !top_level_monitor.nil? + calculate_subtree_state(top_level_monitor, monitor_set) + end + monitor_set.get_map.each{|k,v| + if v.is_aggregate_monitor + v.calculate_details(monitor_set) + end + } + end + + private + def calculate_subtree_state(monitor, monitor_set) + if monitor.nil? || !monitor.is_aggregate_monitor + raise 'AggregateMonitorStateFinalizer:calculateSubtreeState Parameter monitor must be non-null AggregateMonitor' + end + + member_monitor_instance_ids = monitor.get_member_monitors # monitor_instance_ids + member_monitor_instance_ids.each{|member_monitor_instance_id| + member_monitor = monitor_set.get_monitor(member_monitor_instance_id) + + if !member_monitor.nil? 
&& member_monitor.is_aggregate_monitor + calculate_subtree_state(member_monitor, monitor_set) + end + } + monitor.calculate_state(monitor_set) + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/cluster_health_state.rb b/source/code/plugin/health/cluster_health_state.rb new file mode 100644 index 000000000..ac7e05675 --- /dev/null +++ b/source/code/plugin/health/cluster_health_state.rb @@ -0,0 +1,115 @@ +require "net/http" +require "net/https" +require "uri" + +module HealthModel + class ClusterHealthState + + attr_reader :token_file_path, :cert_file_path, :log, :http_client, :uri, :token + @@resource_uri_template = "%{kube_api_server_url}/apis/azmon.container.insights/v1/namespaces/kube-system/healthstates/cluster-health-state" + + def initialize(token_file_path, cert_file_path) + @token_file_path = token_file_path + @cert_file_path = cert_file_path + @log = HealthMonitorHelpers.get_log_handle + @http_client = get_http_client + @token = get_token + end + + def update_state(state) + get_request = Net::HTTP::Get.new(@uri.request_uri) + + get_request["Authorization"] = "Bearer #{@token}" + @log.info "Making GET request to #{@uri.request_uri} @ #{Time.now.utc.iso8601}" + get_response = @http_client.request(get_request) + @log.info "Got response of #{get_response.code} for #{@uri.request_uri} @ #{Time.now.utc.iso8601}" + + if get_response.code.to_i == 404 # NOT found + #POST + update_request = Net::HTTP::Post.new(@uri.request_uri) + update_request["Content-Type"] = "application/json" + + elsif get_response.code.to_i == 200 # Update == Patch + #PATCH + update_request = Net::HTTP::Patch.new(@uri.request_uri) + update_request["Content-Type"] = "application/merge-patch+json" + end + update_request["Authorization"] = "Bearer #{@token}" + + update_request_body = get_update_request_body + update_request_body["state"] = state.to_json + update_request.body = update_request_body.to_json + + update_response = @http_client.request(update_request) + 
@log.info "Got a response of #{update_response.code} for #{update_request.method}" + end + + def get_state + get_request = Net::HTTP::Get.new(@uri.request_uri) + get_request["Authorization"] = "Bearer #{@token}" + @log.info "Making GET request to #{@uri.request_uri} @ #{Time.now.utc.iso8601}" + get_response = @http_client.request(get_request) + @log.info "Got response of #{get_response.code} for #{@uri.request_uri} @ #{Time.now.utc.iso8601}" + + if get_response.code.to_i == 200 + return JSON.parse(JSON.parse(get_response.body)["state"]) + else + return {} + end + end + + private + def get_token() + begin + if File.exist?(@token_file_path) && File.readable?(@token_file_path) + token_str = File.read(@token_file_path).strip + return token_str + else + @log.info ("Unable to read token string from #{@token_file_path}") + return nil + end + end + end + + def get_http_client() + kube_api_server_url = get_kube_api_server_url + resource_uri = @@resource_uri_template % { + kube_api_server_url: kube_api_server_url + } + @uri = URI.parse(resource_uri) + http = Net::HTTP.new(@uri.host, @uri.port) + http.use_ssl = true + if !File.exist?(@cert_file_path) + raise "#{@cert_file_path} doesnt exist" + else + http.ca_file = @cert_file_path + end + http.verify_mode = OpenSSL::SSL::VERIFY_PEER + return http + end + + def get_kube_api_server_url + if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"] + return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}" + else + @log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri") + if Gem.win_platform? #unit testing on windows dev machine + value = %x( kubectl -n default get endpoints kubernetes --no-headers) + url = "https://#{value.split(' ')[1]}" + return "https://localhost:8080" # This is NEVER used. 
this is just to return SOME value + end + return nil + end + end + + def get_update_request_body + body = {} + body["apiVersion"] = "azmon.container.insights/v1" + body["kind"] = "HealthState" + body["metadata"] = {} + body["metadata"]["name"] = "cluster-health-state" + body["metadata"]["namespace"] = "kube-system" + return body + end + end +end diff --git a/source/code/plugin/health/health_hierarchy_builder.rb b/source/code/plugin/health/health_hierarchy_builder.rb new file mode 100644 index 000000000..2da0050db --- /dev/null +++ b/source/code/plugin/health/health_hierarchy_builder.rb @@ -0,0 +1,76 @@ +require 'json' +module HealthModel + class HealthHierarchyBuilder + + attr_accessor :health_model_definition, :monitor_factory + + def initialize(health_model_definition, monitor_factory) + + if !health_model_definition.is_a?(ParentMonitorProvider) + raise "Invalid Type Expected: ParentMonitorProvider Actual: #{@health_model_definition.class.name}" + end + @health_model_definition = health_model_definition + + if !monitor_factory.is_a?(MonitorFactory) + raise "Invalid Type Expected: MonitorFactory Actual: #{@monitor_factory.class.name}" + end + @monitor_factory = monitor_factory + end + + def process_record(health_monitor_record, monitor_set) + if !health_monitor_record.is_a?(HealthMonitorRecord) + raise "Unexpected Type #{health_monitor_record.class}" + end + + # monitor state transition will always be on a unit monitor + child_monitor = @monitor_factory.create_unit_monitor(health_monitor_record) + monitor_set.add_or_update(child_monitor) + parent_monitor_id = @health_model_definition.get_parent_monitor_id(child_monitor) + monitor_labels = child_monitor.labels + monitor_id = child_monitor.monitor_id + + # to construct the parent monitor, + # 1. Child's labels + # 2. Parent monitor's config to determine what labels to copy + # 3. Parent Monitor Id + # 4. 
Monitor Id --> Labels to hash Mapping to generate the monitor instance id for aggregate monitors + + while !parent_monitor_id.nil? + #puts "Parent Monitor Id #{parent_monitor_id}" + # get the set of labels to copy to parent monitor + parent_monitor_labels = @health_model_definition.get_parent_monitor_labels(monitor_id, monitor_labels, parent_monitor_id) + # get the parent monitor configuration + parent_monitor_configuration = @health_model_definition.get_parent_monitor_config(parent_monitor_id) + #get monitor instance id for parent monitor. Does this belong in ParentMonitorProvider? + parent_monitor_instance_id = @health_model_definition.get_parent_monitor_instance_id(child_monitor.monitor_instance_id, parent_monitor_id, parent_monitor_labels) + # check if monitor set has the parent monitor id + # if not present, add + # if present, update the state based on the aggregation algorithm + parent_monitor = nil + if !monitor_set.contains?(parent_monitor_instance_id) + parent_monitor = @monitor_factory.create_aggregate_monitor(parent_monitor_id, parent_monitor_instance_id, parent_monitor_labels, parent_monitor_configuration['aggregation_algorithm'], parent_monitor_configuration['aggregation_algorithm_params'], child_monitor) + parent_monitor.add_member_monitor(child_monitor.monitor_instance_id) + else + parent_monitor = monitor_set.get_monitor(parent_monitor_instance_id) + # required to calculate the rollup state + parent_monitor.add_member_monitor(child_monitor.monitor_instance_id) + # update to the earliest of the transition times of child monitors + if child_monitor.transition_date_time < parent_monitor.transition_date_time + parent_monitor.transition_date_time = child_monitor.transition_date_time + end + end + + if parent_monitor.nil? 
+ raise 'Parent_monitor should not be nil for #{monitor_id}' + end + + monitor_set.add_or_update(parent_monitor) + + child_monitor = parent_monitor + parent_monitor_id = @health_model_definition.get_parent_monitor_id(child_monitor) + monitor_labels = child_monitor.labels + monitor_id = child_monitor.monitor_id + end + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_kube_api_down_handler.rb b/source/code/plugin/health/health_kube_api_down_handler.rb new file mode 100644 index 000000000..7f7ba1bd3 --- /dev/null +++ b/source/code/plugin/health/health_kube_api_down_handler.rb @@ -0,0 +1,27 @@ +module HealthModel + class HealthKubeApiDownHandler + def initialize + @@monitors_to_change = [HealthMonitorConstants::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID, + HealthMonitorConstants::WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID, + HealthMonitorConstants::NODE_CONDITION_MONITOR_ID, + HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID, + HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID] + end + + # update kube-api dependent monitors to be 'unknown' if kube-api is down or monitor is unavailable + def handle_kube_api_down(health_monitor_records) + health_monitor_records_map = {} + + health_monitor_records.map{|record| health_monitor_records_map[record.monitor_instance_id] = record} + if !health_monitor_records_map.key?(HealthMonitorConstants::KUBE_API_STATUS) || (health_monitor_records_map.key?(HealthMonitorConstants::KUBE_API_STATUS) && health_monitor_records_map[HealthMonitorConstants::KUBE_API_STATUS].state != 'pass') + #iterate over the map and set the state to unknown for related monitors + health_monitor_records.each{|health_monitor_record| + if @@monitors_to_change.include?(health_monitor_record.monitor_id) + health_monitor_record.state = HealthMonitorStates::UNKNOWN + end + } + end + return health_monitor_records + end + end +end \ No newline at end of file diff --git 
a/source/code/plugin/health/health_kubernetes_resources.rb b/source/code/plugin/health/health_kubernetes_resources.rb new file mode 100644 index 000000000..53f879bf5 --- /dev/null +++ b/source/code/plugin/health/health_kubernetes_resources.rb @@ -0,0 +1,102 @@ +require 'singleton' + +module HealthModel + class HealthKubernetesResources + + include Singleton + attr_accessor :node_inventory, :pod_inventory, :deployment_inventory + attr_reader :nodes, :pods, :workloads + + def initialize + @node_inventory = [] + @pod_inventory = [] + @deployment_inventory = [] + @nodes = [] + @pods = [] + @workloads = [] + @log = HealthMonitorHelpers.get_log_handle + end + + def get_node_inventory + return @node_inventory + end + + def get_nodes + @nodes = [] + @node_inventory['items'].each {|node| + if !@nodes.include?(node['metadata']['name']) + @nodes.push(node['metadata']['name']) + end + + } + return @nodes + end + + def get_pod_inventory + return @pod_inventory + end + + def get_pods + return @pods + end + + def get_workload_names + @pods = [] + workload_names = {} + deployment_lookup = {} + @deployment_inventory['items'].each do |deployment| + match_labels = deployment['spec']['selector']['matchLabels'].to_h + namespace = deployment['metadata']['namespace'] + match_labels.each{|k,v| + deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" + } + end + @pod_inventory['items'].each do |pod| + begin + has_owner = !pod['metadata']['ownerReferences'].nil? + owner_kind = '' + if has_owner + owner_kind = pod['metadata']['ownerReferences'][0]['kind'] + controller_name = pod['metadata']['ownerReferences'][0]['name'] + else + owner_kind = pod['kind'] + controller_name = pod['metadata']['name'] + end + + namespace = pod['metadata']['namespace'] + + workload_name = '' + if owner_kind.nil? 
+ owner_kind = 'Pod' + end + case owner_kind.downcase + when 'job' + # we are excluding jobs + next + when 'replicaset' + # get the labels, and see if there is a match. If there is, it is the deployment. If not, use replica set name/controller name + labels = pod['metadata']['labels'].to_h + labels.each {|k,v| + lookup_key = "#{namespace}-#{k}=#{v}" + if deployment_lookup.key?(lookup_key) + workload_name = deployment_lookup[lookup_key] + break + end + } + if workload_name.empty? + workload_name = "#{namespace}~~#{controller_name}" + end + when 'daemonset' + workload_name = "#{namespace}~~#{controller_name}" + else + workload_name = "#{namespace}~~#{pod['metadata']['name']}" + end + rescue => e + @log.info "Error when processing pod #{pod['metadata']['name']} #{e.message}" + end + workload_names[workload_name] = true + end + return workload_names.keys + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_missing_signal_generator.rb b/source/code/plugin/health/health_missing_signal_generator.rb new file mode 100644 index 000000000..ff7f6a390 --- /dev/null +++ b/source/code/plugin/health/health_missing_signal_generator.rb @@ -0,0 +1,142 @@ +module HealthModel + class HealthMissingSignalGenerator + attr_accessor :last_received_records, :current_received_records + attr_reader :missing_signals, :unknown_signals_hash + + def initialize() + @last_received_records = {} + @unknown_signals_hash = {} + end + + def get_missing_signals(cluster_id, health_monitor_records, health_k8s_inventory, provider) + missing_monitor_ids = [] + nodes = health_k8s_inventory.get_nodes + workload_names = health_k8s_inventory.get_workload_names + missing_signals_map = {} + missing_signals = [] + health_monitor_records_map = {} + health_monitor_records.map{ + |monitor| health_monitor_records_map[monitor.monitor_instance_id] = monitor + } + + node_signals_hash = {} + nodes.each{|node| + node_signals_hash[node] = [HealthMonitorConstants::NODE_CPU_MONITOR_ID, 
HealthMonitorConstants::NODE_MEMORY_MONITOR_ID, HealthMonitorConstants::NODE_CONDITION_MONITOR_ID] + } + log = HealthMonitorHelpers.get_log_handle + log.info "last_received_records #{@last_received_records.size} nodes #{nodes}" + @last_received_records.each{|monitor_instance_id, monitor| + if !health_monitor_records_map.key?(monitor_instance_id) + if HealthMonitorHelpers.is_node_monitor(monitor.monitor_id) + node_name = monitor.labels['kubernetes.io/hostname'] + new_monitor = HealthMonitorRecord.new( + monitor.monitor_id, + monitor.monitor_instance_id, + Time.now.utc.iso8601, + monitor.state, + monitor.labels, + monitor.config, + {"timestamp" => Time.now.utc.iso8601, "state" => HealthMonitorStates::UNKNOWN, "details" => ""} + ) + if !node_name.nil? && nodes.include?(node_name) + new_monitor.state = HealthMonitorStates::UNKNOWN + new_monitor.details["state"] = HealthMonitorStates::UNKNOWN + new_monitor.details["details"] = "Node present in inventory but no signal for #{monitor.monitor_id} from node #{node_name}" + @unknown_signals_hash[monitor_instance_id] = new_monitor + elsif !node_name.nil? && !nodes.include?(node_name) + new_monitor.state = HealthMonitorStates::NONE + new_monitor.details["state"] = HealthMonitorStates::NONE + new_monitor.details["details"] = "Node NOT present in inventory. node: #{node_name}" + end + missing_signals_map[monitor_instance_id] = new_monitor + log.info "Added missing signal #{new_monitor.monitor_instance_id} #{new_monitor.state}" + elsif HealthMonitorHelpers.is_pods_ready_monitor(monitor.monitor_id) + lookup = "#{monitor.labels[HealthMonitorLabels::NAMESPACE]}~~#{monitor.labels[HealthMonitorLabels::WORKLOAD_NAME]}" + new_monitor = HealthMonitorRecord.new( + monitor.monitor_id, + monitor.monitor_instance_id, + Time.now.utc.iso8601, + monitor.state, + monitor.labels, + monitor.config, + {"timestamp" => Time.now.utc.iso8601, "state" => HealthMonitorStates::UNKNOWN, "details" => ""} + ) + if !lookup.nil? 
&& workload_names.include?(lookup) + new_monitor.state = HealthMonitorStates::UNKNOWN + new_monitor.details["state"] = HealthMonitorStates::UNKNOWN + new_monitor.details["details"] = "Workload present in inventory. But no signal for #{lookup}" + @unknown_signals_hash[monitor_instance_id] = new_monitor + elsif !lookup.nil? && !workload_names.include?(lookup) + new_monitor.state = HealthMonitorStates::NONE + new_monitor.details["state"] = HealthMonitorStates::NONE + new_monitor.details["details"] = "Workload #{lookup} NOT present in inventory" + end + missing_signals_map[monitor_instance_id] = new_monitor + end + end + } + + + health_monitor_records.each{|health_monitor_record| + # remove signals from the list of expected signals if we see them in the list of current signals + if HealthMonitorHelpers.is_node_monitor(health_monitor_record.monitor_id) + node_name = health_monitor_record.labels['kubernetes.io/hostname'] + if node_signals_hash.key?(node_name) + signals = node_signals_hash[node_name] + signals.delete(health_monitor_record.monitor_id) + if signals.size == 0 + node_signals_hash.delete(node_name) + end + end + end + } + + # if the hash is not empty, means we have missing signals + if node_signals_hash.size > 0 + # these signals were not sent previously + # these signals need to be assigned an unknown state + node_signals_hash.each{|node, monitor_ids| + monitor_ids.each{|monitor_id| + monitor_instance_id = HealthMonitorHelpers.get_monitor_instance_id(monitor_id, [cluster_id, node]) + new_monitor = HealthMonitorRecord.new( + monitor_id, + monitor_instance_id, + Time.now.utc.iso8601, + HealthMonitorStates::UNKNOWN, + provider.get_node_labels(node), + {}, + {"timestamp" => Time.now.utc.iso8601, "state" => HealthMonitorStates::UNKNOWN, "details" => "no signal received from node #{node}"} + ) + missing_signals_map[monitor_instance_id] = new_monitor + log.info "Added missing signal when node_signals_hash was not empty #{new_monitor.monitor_instance_id} 
#{new_monitor.state}" + } + } + end + + missing_signals_map.each{|k,v| + missing_signals.push(v) + } + + # if an unknown signal is present neither in missing signals or the incoming signals, change its state to none, and remove from unknown_signals + # in update_state of HealthMonitorState, send if latest_record_state is none + @unknown_signals_hash.each{|k,v| + if !missing_signals_map.key?(k) && !health_monitor_records_map.key?(k) + monitor_record = @unknown_signals_hash[k] + monitor_record.details["state"] = HealthMonitorStates::NONE # used for calculating the old and new states in update_state + monitor_record.state = HealthMonitorStates::NONE #used for calculating the aggregate monitor state + missing_signals.push(monitor_record) + @unknown_signals_hash.delete(k) + log.info "Updating state from unknown to none for #{k}" + end + } + return missing_signals + end + + def update_last_received_records(last_received_records) + last_received_records_map = {} + last_received_records.map {|record| last_received_records_map[record.monitor_instance_id] = record } + @last_received_records = last_received_records_map + end + end + +end \ No newline at end of file diff --git a/source/code/plugin/health/health_model_buffer.rb b/source/code/plugin/health/health_model_buffer.rb new file mode 100644 index 000000000..1ccfe7349 --- /dev/null +++ b/source/code/plugin/health/health_model_buffer.rb @@ -0,0 +1,29 @@ +module HealthModel + +=begin + Class that is used to create a buffer for collecting the health records +=end + class HealthModelBuffer + + attr_reader :records_buffer, :log + + def initialize + @records_buffer = [] + end + + # Returns the current buffer + def get_buffer + return @records_buffer + end + + # adds records to the buffer + def add_to_buffer(records) + @records_buffer.push(*records) + end + + # clears/resets the buffer + def reset_buffer + @records_buffer = [] + end + end +end \ No newline at end of file diff --git 
a/source/code/plugin/health/health_model_builder.rb b/source/code/plugin/health/health_model_builder.rb new file mode 100644 index 000000000..4cf802798 --- /dev/null +++ b/source/code/plugin/health/health_model_builder.rb @@ -0,0 +1,37 @@ +require_relative 'health_model_constants' +require 'time' + +module HealthModel + class HealthModelBuilder + attr_accessor :hierarchy_builder, :state_finalizers, :monitor_set + + def initialize(hierarchy_builder, state_finalizers, monitor_set) + @hierarchy_builder = hierarchy_builder + @state_finalizers = state_finalizers + @monitor_set = monitor_set + end + + def process_records(health_records) + health_records.each{|health_record| + @hierarchy_builder.process_record(health_record, @monitor_set) + } + end + + def finalize_model + if !@state_finalizers.is_a?(Array) + raise 'state finalizers should be an array' + end + + if @state_finalizers.length == 0 + raise '@state_finalizers length should not be zero or empty' + end + + @state_finalizers.each{|finalizer| + finalizer.finalize(@monitor_set) + } + + return @monitor_set.get_map + end + + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_model_constants.rb b/source/code/plugin/health/health_model_constants.rb new file mode 100644 index 000000000..82ae569f3 --- /dev/null +++ b/source/code/plugin/health/health_model_constants.rb @@ -0,0 +1,81 @@ +module HealthModel + class MonitorState + CRITICAL = "fail" + ERROR = "err" + WARNING = "warn" + NONE = "none" + HEALTHY = "pass" + UNKNOWN = "unknown" + end + + class AggregationAlgorithm + WORSTOF = "worstOf" + PERCENTAGE = "percentage" + end + + class MonitorId + CLUSTER = 'cluster'; + ALL_NODES = 'all_nodes'; + K8S_INFRASTRUCTURE = 'k8s_infrastructure' + + NODE = 'node'; + AGENT_NODE_POOL = 'agent_node_pool' + MASTER_NODE_POOL = 'master_node_pool' + ALL_AGENT_NODE_POOLS = 'all_agent_node_pools' + ALL_NODE_POOLS = 'all_node_pools'; + + WORKLOAD = 'all_workloads'; + CAPACITY = 'capacity'; + + USER_WORKLOAD 
= 'user_workload'; + SYSTEM_WORKLOAD = 'system_workload' + NAMESPACE = 'namespace'; + end + + class HealthMonitorRecordFields + CLUSTER_ID = "ClusterId" + MONITOR_ID = "MonitorId" + MONITOR_INSTANCE_ID = "MonitorInstanceId" + MONITOR_LABELS = "MonitorLabels" + DETAILS = "Details" + MONITOR_CONFIG = "MonitorConfig" + OLD_STATE = "OldState" + NEW_STATE = "NewState" + AGENT_COLLECTION_TIME = "AgentCollectionTime" + TIME_FIRST_OBSERVED = "TimeFirstObserved" + NODE_NAME = "NodeName" + NAMESPACE = "Namespace" + end + + class HealthMonitorConstants + NODE_CPU_MONITOR_ID = "node_cpu_utilization" + NODE_MEMORY_MONITOR_ID = "node_memory_utilization" + CONTAINER_CPU_MONITOR_ID = "container_cpu_utilization" + CONTAINER_MEMORY_MONITOR_ID = "container_memory_utilization" + NODE_CONDITION_MONITOR_ID = "node_condition" + WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID = "subscribed_capacity_cpu" + WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID = "subscribed_capacity_memory" + WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID = "container_cpu_utilization" + WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID = "container_memory_utilization" + KUBE_API_STATUS = "kube_api_status" + USER_WORKLOAD_PODS_READY_MONITOR_ID = "user_workload_pods_ready" + SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID = "system_workload_pods_ready" + end + + class HealthMonitorStates + PASS = "pass" + FAIL = "fail" + WARNING = "warn" + NONE = "none" + UNKNOWN = "unknown" + end + + class HealthMonitorLabels + WORKLOAD_NAME = "container.azm.ms/workload-name" + WORKLOAD_KIND = "container.azm.ms/workload-kind" + NAMESPACE = "container.azm.ms/namespace" + AGENTPOOL = "agentpool" + ROLE = "kubernetes.io/role" + HOSTNAME = "kubernetes.io/hostname" + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_model_definition_parser.rb b/source/code/plugin/health/health_model_definition_parser.rb new file mode 100644 index 000000000..f6c7a781d --- /dev/null +++ b/source/code/plugin/health/health_model_definition_parser.rb @@ 
-0,0 +1,50 @@ +=begin + Class to parse the health model definition. The definition expresses the relationship between monitors, how to roll up to an aggregate monitor, + and what labels to "pass on" to the parent monitor +=end +require 'json' + +module HealthModel + class HealthModelDefinitionParser + attr_accessor :health_model_definition_path, :health_model_definition + + # Constructor + def initialize(path) + @health_model_definition = {} + @health_model_definition_path = path + end + + # Parse the health model definition file and build the model roll-up hierarchy + def parse_file + if (!File.exist?(@health_model_definition_path)) + raise "File does not exist in the specified path" + end + + file = File.read(@health_model_definition_path) + temp_model = JSON.parse(file) + temp_model.each { |entry| + monitor_id = entry['monitor_id'] + parent_monitor_id = entry['parent_monitor_id'] + labels = entry['labels'] if entry['labels'] + aggregation_algorithm = entry['aggregation_algorithm'] if entry['aggregation_algorithm'] + aggregation_algorithm_params = entry['aggregation_algorithm_params'] if entry['aggregation_algorithm_params'] + if parent_monitor_id.is_a?(Array) + conditions = [] + parent_monitor_id.each{|condition| + key = condition['label'] + operator = condition['operator'] + value = condition['value'] + parent_id = condition['id'] + conditions.push({"key" => key, "operator" => operator, "value" => value, "parent_id" => parent_id}) + } + @health_model_definition[monitor_id] = {"conditions" => conditions, "labels" => labels, "aggregation_algorithm" => aggregation_algorithm, "aggregation_algorithm_params" =>aggregation_algorithm_params} + elsif parent_monitor_id.is_a?(String) + @health_model_definition[monitor_id] = {"parent_monitor_id" => parent_monitor_id, "labels" => labels, "aggregation_algorithm" => aggregation_algorithm, "aggregation_algorithm_params" =>aggregation_algorithm_params} + elsif parent_monitor_id.nil? 
+ @health_model_definition[monitor_id] = {"parent_monitor_id" => nil, "labels" => labels, "aggregation_algorithm" => aggregation_algorithm, "aggregation_algorithm_params" =>aggregation_algorithm_params} + end + } + @health_model_definition + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_monitor_helpers.rb b/source/code/plugin/health/health_monitor_helpers.rb new file mode 100644 index 000000000..9e2977a0e --- /dev/null +++ b/source/code/plugin/health/health_monitor_helpers.rb @@ -0,0 +1,36 @@ +require 'logger' +require 'digest' + +module HealthModel + # static class that provides a bunch of utility methods + class HealthMonitorHelpers + + @log_path = "/var/opt/microsoft/docker-cimprov/log/health_monitors.log" + + if Gem.win_platform? #unit testing on windows dev machine + @log_path = "C:\Temp\health_monitors.log" + end + + @log = Logger.new(@log_path, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M + + class << self + def is_node_monitor(monitor_id) + return (monitor_id == HealthMonitorConstants::NODE_CPU_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_MEMORY_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_CONDITION_MONITOR_ID) + end + + def is_pods_ready_monitor(monitor_id) + return (monitor_id == HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID || monitor_id == HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) + end + + def get_log_handle + return @log + end + + def get_monitor_instance_id(monitor_id, args = []) + string_to_hash = args.join("/") + return "#{monitor_id}-#{Digest::MD5.hexdigest(string_to_hash)}" + end + end + + end +end diff --git a/source/code/plugin/health/health_monitor_optimizer.rb b/source/code/plugin/health/health_monitor_optimizer.rb new file mode 100644 index 000000000..b33c8a986 --- /dev/null +++ b/source/code/plugin/health/health_monitor_optimizer.rb @@ -0,0 +1,52 @@ +module HealthModel + class HealthMonitorOptimizer + #ctor + def initialize 
+ @@health_signal_timeout = 240 + @@first_record_sent = {} + end + + def should_send(monitor_instance_id, health_monitor_state, health_monitor_config) + + health_monitor_instance_state = health_monitor_state.get_state(monitor_instance_id) + health_monitor_records = health_monitor_instance_state.prev_records + health_monitor_config['ConsecutiveSamplesForStateTransition'].nil? ? samples_to_check = 1 : samples_to_check = health_monitor_config['ConsecutiveSamplesForStateTransition'].to_i + + latest_record = health_monitor_records[health_monitor_records.size-1] #since we push new records to the end, and remove oldest records from the beginning + latest_record_state = latest_record["state"] + latest_record_time = latest_record["timestamp"] #string representation of time + + new_state = health_monitor_instance_state.new_state + prev_sent_time = health_monitor_instance_state.prev_sent_record_time + time_first_observed = health_monitor_instance_state.state_change_time + + if latest_record_state.downcase == new_state.downcase + time_elapsed = (Time.parse(latest_record_time) - Time.parse(prev_sent_time)) / 60 + if time_elapsed > @@health_signal_timeout # minutes + return true + elsif !@@first_record_sent.key?(monitor_instance_id) + @@first_record_sent[monitor_instance_id] = true + return true + else + return false + end + else + if samples_to_check == 1 + return true + elsif health_monitor_instance_state.prev_records.size == 1 && samples_to_check > 1 + return true + elsif health_monitor_instance_state.prev_records.size < samples_to_check + return false + else + # state change from previous sent state to latest record state + #check state of last n records to see if they are all in the same state + if (health_monitor_instance_state.is_state_change_consistent) + return true + else + return false + end + end + end + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_monitor_provider.rb b/source/code/plugin/health/health_monitor_provider.rb 
new file mode 100644 index 000000000..0c1cbf7f2 --- /dev/null +++ b/source/code/plugin/health/health_monitor_provider.rb @@ -0,0 +1,123 @@ +module HealthModel + class HealthMonitorProvider + + attr_accessor :cluster_labels, :health_kubernetes_resources, :monitor_configuration_path, :cluster_id + attr_reader :monitor_configuration + + def initialize(cluster_id, cluster_labels, health_kubernetes_resources, monitor_configuration_path) + @cluster_labels = Hash.new + cluster_labels.each{|k,v| @cluster_labels[k] = v} + @cluster_id = cluster_id + @health_kubernetes_resources = health_kubernetes_resources + @monitor_configuration_path = monitor_configuration_path + begin + @monitor_configuration = {} + file = File.open(@monitor_configuration_path, "r") + if !file.nil? + fileContents = file.read + @monitor_configuration = JSON.parse(fileContents) + file.close + end + rescue => e + @log.info "Error when opening health config file #{e}" + end + end + + def get_record(health_monitor_record, health_monitor_state) + + labels = Hash.new + @cluster_labels.each{|k,v| labels[k] = v} + monitor_id = health_monitor_record.monitor_id + monitor_instance_id = health_monitor_record.monitor_instance_id + health_monitor_instance_state = health_monitor_state.get_state(monitor_instance_id) + + + monitor_labels = health_monitor_record.labels + if !monitor_labels.empty? 
+ monitor_labels.keys.each do |key| + labels[key] = monitor_labels[key] + end + end + + prev_records = health_monitor_instance_state.prev_records + time_first_observed = health_monitor_instance_state.state_change_time # the oldest collection time + new_state = health_monitor_instance_state.new_state # this is updated before formatRecord is called + old_state = health_monitor_instance_state.old_state + + config = get_config(monitor_id) + + if prev_records.size == 1 + details = prev_records[0] + else + details = prev_records + end + + time_observed = Time.now.utc.iso8601 + + monitor_record = {} + + monitor_record[HealthMonitorRecordFields::CLUSTER_ID] = @cluster_id + monitor_record[HealthMonitorRecordFields::MONITOR_LABELS] = labels.to_json + monitor_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + monitor_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + monitor_record[HealthMonitorRecordFields::NEW_STATE] = new_state + monitor_record[HealthMonitorRecordFields::OLD_STATE] = old_state + monitor_record[HealthMonitorRecordFields::DETAILS] = details.to_json + monitor_record[HealthMonitorRecordFields::MONITOR_CONFIG] = config.to_json + monitor_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = Time.now.utc.iso8601 + monitor_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_first_observed + + return monitor_record + end + + def get_config(monitor_id) + if @monitor_configuration.key?(monitor_id) + return @monitor_configuration[monitor_id] + else + return {} + end + end + + def get_labels(health_monitor_record) + monitor_labels = Hash.new + @cluster_labels.keys.each{|key| + monitor_labels[key] = @cluster_labels[key] + } + monitor_id = health_monitor_record[HealthMonitorRecordFields::MONITOR_ID] + case monitor_id + when HealthMonitorConstants::CONTAINER_CPU_MONITOR_ID, HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID, HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID, 
HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID + + namespace = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['namespace'] + workload_name = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['workloadName'] + workload_kind = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['workloadKind'] + + monitor_labels[HealthMonitorLabels::WORKLOAD_NAME] = workload_name.split('~~')[1] + monitor_labels[HealthMonitorLabels::WORKLOAD_KIND] = workload_kind + monitor_labels[HealthMonitorLabels::NAMESPACE] = namespace + + when HealthMonitorConstants::NODE_CPU_MONITOR_ID, HealthMonitorConstants::NODE_MEMORY_MONITOR_ID, HealthMonitorConstants::NODE_CONDITION_MONITOR_ID + node_name = health_monitor_record[HealthMonitorRecordFields::NODE_NAME] + @health_kubernetes_resources.get_node_inventory['items'].each do |node| + if !node_name.nil? && !node['metadata']['name'].nil? && node_name == node['metadata']['name'] + if !node["metadata"].nil? && !node["metadata"]["labels"].nil? + monitor_labels = monitor_labels.merge(node["metadata"]["labels"]) + end + end + end + end + return monitor_labels + end + + def get_node_labels(node_name) + monitor_labels = {} + @health_kubernetes_resources.get_node_inventory['items'].each do |node| + if !node_name.nil? && !node['metadata']['name'].nil? && node_name == node['metadata']['name'] + if !node["metadata"].nil? && !node["metadata"]["labels"].nil? 
+ monitor_labels = node["metadata"]["labels"] + end + end + end + return monitor_labels + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_monitor_record.rb b/source/code/plugin/health/health_monitor_record.rb new file mode 100644 index 000000000..873736c3a --- /dev/null +++ b/source/code/plugin/health/health_monitor_record.rb @@ -0,0 +1,10 @@ +HealthMonitorRecord = Struct.new( + :monitor_id, + :monitor_instance_id, + :transition_date_time, + :state, + :labels, + :config, + :details + ) do +end \ No newline at end of file diff --git a/source/code/plugin/health/health_monitor_state.rb b/source/code/plugin/health/health_monitor_state.rb new file mode 100644 index 000000000..c3df5e3a9 --- /dev/null +++ b/source/code/plugin/health/health_monitor_state.rb @@ -0,0 +1,214 @@ +module HealthModel + + HealthMonitorInstanceState = Struct.new(:prev_sent_record_time, :old_state, :new_state, :state_change_time, :prev_records, :is_state_change_consistent, :should_send) do + end + + # Class that is used to store the last sent state and latest monitors + # provides services like + # get_state -- returns the current state and details + # update_instance -- updates the state of the health monitor history records + # set_state -- sets the last health monitor state + class HealthMonitorState + + def initialize + @@monitor_states = {} + @@first_record_sent = {} + @@health_signal_timeout = 240 + end + + def get_state(monitor_instance_id) + if @@monitor_states.key?(monitor_instance_id) + return @@monitor_states[monitor_instance_id] + end + end + + def set_state(monitor_instance_id, health_monitor_instance_state) + @@monitor_states[monitor_instance_id] = health_monitor_instance_state + end + + def to_h + return @@monitor_states + end + + def initialize_state(deserialized_state) + @@monitor_states = {} + deserialized_state.each{|k,v| + health_monitor_instance_state_hash = JSON.parse(v) + state = 
module HealthModel
  class HealthMonitorState
    # When is a monitor state transition sent?
    # ---------------------------------------
    #   1. if the signal has not been sent before
    #   2. if there is a "consistent" state change across the configured number
    #      of consecutive samples
    #   3. if the signal is stale (older than the signal timeout)
    #   4. if the latest state is none
    #
    # monitor        -- UnitMonitor/AggregateMonitor carrying the new sample
    # monitor_config -- Hash of monitor settings; may be nil
    def update_state(monitor, monitor_config)
      samples_to_keep = 1
      monitor_instance_id = monitor.monitor_instance_id
      log = HealthMonitorHelpers.get_log_handle
      current_time = Time.now.utc.iso8601
      health_monitor_instance_state = get_state(monitor_instance_id)
      if !health_monitor_instance_state.nil?
        # reset the per-iteration flags before re-evaluating this sample
        health_monitor_instance_state.is_state_change_consistent = false
        health_monitor_instance_state.should_send = false
        set_state(monitor_instance_id, health_monitor_instance_state)
      end

      if !monitor_config.nil? && !monitor_config['ConsecutiveSamplesForStateTransition'].nil?
        samples_to_keep = monitor_config['ConsecutiveSamplesForStateTransition'].to_i
      end

      if @@monitor_states.key?(monitor_instance_id)
        health_monitor_instance_state = @@monitor_states[monitor_instance_id]
        health_monitor_records = health_monitor_instance_state.prev_records # bounded history array
        if health_monitor_records.size == samples_to_keep
          health_monitor_records.delete_at(0) # drop the oldest sample
        end
        health_monitor_records.push(monitor.details)
        health_monitor_instance_state.prev_records = health_monitor_records
        @@monitor_states[monitor_instance_id] = health_monitor_instance_state
      else
        # First sample for this monitor instance. If a single sample is enough
        # for a transition, take its state immediately; otherwise start at NONE.
        old_state = HealthMonitorStates::NONE
        new_state = HealthMonitorStates::NONE
        if samples_to_keep == 1
          new_state = monitor.state
        end

        health_monitor_instance_state = HealthMonitorInstanceState.new(
          monitor.transition_date_time,
          old_state,
          new_state,
          monitor.transition_date_time,
          [monitor.details])
        health_monitor_instance_state.should_send = true
        @@monitor_states[monitor_instance_id] = health_monitor_instance_state
      end

      # update old and new state based on the history and the latest record
      health_monitor_records = health_monitor_instance_state.prev_records
      # BUG FIX: guard against a nil monitor_config here; the earlier read was
      # nil-guarded but this one was not and raised NoMethodError.
      if monitor_config.nil? || monitor_config['ConsecutiveSamplesForStateTransition'].nil?
        samples_to_check = 1
      else
        samples_to_check = monitor_config['ConsecutiveSamplesForStateTransition'].to_i
      end

      # new records are pushed to the end; oldest are removed from the front
      latest_record = health_monitor_records[health_monitor_records.size - 1]
      latest_record_state = latest_record["state"]
      latest_record_time = latest_record["timestamp"] # string representation of time

      new_state = health_monitor_instance_state.new_state
      prev_sent_time = health_monitor_instance_state.prev_sent_record_time

      if latest_record_state.downcase == new_state.downcase
        # state unchanged since the last send: re-send only when the signal is
        # stale, or when this instance has never been sent at all
        time_elapsed = (Time.parse(latest_record_time) - Time.parse(prev_sent_time)) / 60
        if time_elapsed > @@health_signal_timeout # minutes
          health_monitor_instance_state.old_state = health_monitor_instance_state.new_state
          health_monitor_instance_state.new_state = latest_record_state
          health_monitor_instance_state.prev_sent_record_time = current_time
          health_monitor_instance_state.should_send = true
          set_state(monitor_instance_id, health_monitor_instance_state)
          log.debug "#{monitor_instance_id} condition: signal timeout should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
        elsif !@@first_record_sent.key?(monitor_instance_id)
          @@first_record_sent[monitor_instance_id] = true
          health_monitor_instance_state.should_send = true
          set_state(monitor_instance_id, health_monitor_instance_state)
        end
      else
        # NOTE(review): assumes HealthMonitorStates::NONE is lower case, since
        # it is compared against a downcased state -- confirm the constant.
        if latest_record_state.downcase == HealthMonitorStates::NONE
          # a NONE state is always sent immediately
          health_monitor_instance_state.old_state = health_monitor_instance_state.new_state
          health_monitor_instance_state.new_state = latest_record_state
          health_monitor_instance_state.state_change_time = current_time
          health_monitor_instance_state.prev_sent_record_time = current_time
          health_monitor_instance_state.should_send = true
          if !@@first_record_sent.key?(monitor_instance_id)
            @@first_record_sent[monitor_instance_id] = true
          end
          set_state(monitor_instance_id, health_monitor_instance_state)
          log.debug "#{monitor_instance_id} condition: NONE state should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
        elsif samples_to_check == 1
          # monitors that notify instantly on any state change
          health_monitor_instance_state.old_state = health_monitor_instance_state.new_state
          health_monitor_instance_state.new_state = latest_record_state
          health_monitor_instance_state.state_change_time = current_time
          health_monitor_instance_state.prev_sent_record_time = current_time
          health_monitor_instance_state.should_send = true
          if !@@first_record_sent.key?(monitor_instance_id)
            @@first_record_sent[monitor_instance_id] = true
          end
          set_state(monitor_instance_id, health_monitor_instance_state)
          log.debug "#{monitor_instance_id} condition: state change, samples_to_check = #{samples_to_check} should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
        else
          # only transition when the last n records all agree on the new state
          # (NOTE(review): passes samples_to_keep, which equals samples_to_check
          # since both come from the same config key)
          if is_state_change_consistent(health_monitor_records, samples_to_keep)
            latest_record = health_monitor_records[health_monitor_records.size - 1]
            latest_record_state = latest_record["state"]

            health_monitor_instance_state.old_state = health_monitor_instance_state.new_state
            health_monitor_instance_state.is_state_change_consistent = true # so it is not recomputed downstream
            health_monitor_instance_state.should_send = true
            health_monitor_instance_state.new_state = latest_record_state
            health_monitor_instance_state.prev_sent_record_time = current_time
            health_monitor_instance_state.state_change_time = current_time

            set_state(monitor_instance_id, health_monitor_instance_state)

            if !@@first_record_sent.key?(monitor_instance_id)
              @@first_record_sent[monitor_instance_id] = true
            end
            log.debug "#{monitor_instance_id} condition: consistent state change, samples_to_check = #{samples_to_check} should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
          end
        end
      end
    end

    private

    # True only when there are at least `samples_to_check` records and every
    # consecutive pair of records agrees on "state".
    def is_state_change_consistent(health_monitor_records, samples_to_check)
      return false if health_monitor_records.nil? || health_monitor_records.empty? || health_monitor_records.size < samples_to_check

      health_monitor_records.each_cons(2).all? { |prev, curr| prev["state"] == curr["state"] }
    end
  end
end
|| health_monitor_records.size == 0 || health_monitor_records.size < samples_to_check + return false + end + i = 0 + while i < health_monitor_records.size - 1 + #log.debug "Prev: #{health_monitor_records[i].state} Current: #{health_monitor_records[i + 1].state}" + if health_monitor_records[i]["state"] != health_monitor_records[i + 1]["state"] + return false + end + i += 1 + end + return true + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb new file mode 100644 index 000000000..df47529e6 --- /dev/null +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -0,0 +1,369 @@ +require 'logger' +require 'digest' + +module HealthModel + # static class that provides a bunch of utility methods + class HealthMonitorUtils + + begin + if !Gem.win_platform? + require_relative '../KubernetesApiClient' + end + rescue => e + $log.info "Error loading KubernetesApiClient #{e.message}" + end + + @@node_inventory = [] + + @log_path = "/var/opt/microsoft/docker-cimprov/log/health_monitors.log" + + if Gem.win_platform? #unit testing on windows dev machine + @log_path = "C:\Temp\health_monitors.log" + end + + @log = Logger.new(@log_path, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M + @@last_refresh_time = '2019-01-01T00:00:00Z' + + class << self + # compute the percentage state given a value and a monitor configuration + def compute_percentage_state(value, config) + + if config.nil? || config['WarnThresholdPercentage'].nil? + warn_percentage = nil + else + warn_percentage = config['WarnThresholdPercentage'].to_f + end + fail_percentage = config['FailThresholdPercentage'].to_f + + if value > fail_percentage + return HealthMonitorStates::FAIL + elsif !warn_percentage.nil? 
&& value > warn_percentage + return HealthMonitorStates::WARNING + else + return HealthMonitorStates::PASS + end + end + + def is_node_monitor(monitor_id) + return (monitor_id == HealthMonitorConstants::NODE_CPU_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_MEMORY_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_CONDITION_MONITOR_ID) + end + + def is_pods_ready_monitor(monitor_id) + return (monitor_id == HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID || monitor_id == HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) + end + + def is_cluster_health_model_enabled + enabled = ENV["AZMON_CLUSTER_ENABLE_HEALTH_MODEL"] + if !enabled.nil? && enabled.casecmp("true") == 0 + return true + else + return false + end + end + + def get_pods_ready_hash(pod_inventory, deployment_inventory) + pods_ready_percentage_hash = {} + deployment_lookup = {} + deployment_inventory['items'].each do |deployment| + match_labels = deployment['spec']['selector']['matchLabels'].to_h + namespace = deployment['metadata']['namespace'] + match_labels.each{|k,v| + deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" + } + end + pod_inventory['items'].each do |pod| + begin + has_owner = !pod['metadata']['ownerReferences'].nil? + owner_kind = '' + if has_owner + owner_kind = pod['metadata']['ownerReferences'][0]['kind'] + controller_name = pod['metadata']['ownerReferences'][0]['name'] + else + owner_kind = pod['kind'] + controller_name = pod['metadata']['name'] + #log.info "#{JSON.pretty_generate(pod)}" + end + + namespace = pod['metadata']['namespace'] + status = pod['status']['phase'] + + workload_name = '' + if owner_kind.nil? + owner_kind = 'Pod' + end + case owner_kind.downcase + when 'job' + # we are excluding jobs + next + when 'replicaset' + # get the labels, and see if there is a match. If there is, it is the deployment. 
module HealthModel
  class HealthMonitorUtils
    class << self
      # Derives a node's health state from its kubernetes node conditions:
      #   "fail" -- NetworkUnavailable/OutOfDisk is True or Unknown
      #   "warn" -- DiskPressure/MemoryPressure/PIDPressure is True or Unknown
      #   "pass" -- Ready is True and no fail/warn condition was seen first
      # Conditions are evaluated in order; the first fail/warn wins.
      def get_node_state_from_node_conditions(node_conditions)
        # NOTE(review): returns bare strings rather than HealthMonitorStates
        # constants; callers appear to compare strings -- keep them in sync.
        ready = false
        node_conditions.each do |condition|
          type = condition['type']
          status = condition['status']
          degraded = (status == 'True' || status == 'Unknown')

          if (type == "NetworkUnavailable" || type == "OutOfDisk") && degraded
            return "fail"
          elsif (type == "DiskPressure" || type == "MemoryPressure" || type == "PIDPressure") && degraded
            return "warn"
          elsif type == "Ready" && status == 'True'
            ready = true
          end
        end
        ready ? "pass" : "fail"
      end

      # Sums the `metric_name` resource requests across every container in the
      # pod inventory. `metric_capacity` is accepted for interface
      # compatibility but is not used.
      def get_resource_subscription(pod_inventory, metric_name, metric_capacity)
        subscription = 0.0
        if !pod_inventory.empty?
          pod_inventory['items'].each do |pod|
            pod['spec']['containers'].each do |container|
              requests = container['resources']['requests']
              if !requests.nil? && !requests[metric_name].nil?
                subscription += KubernetesApiClient.getMetricNumericValue(metric_name, requests[metric_name])
              end
            end
          end
        end
        subscription
      end

      # Returns [cpu_capacity, memory_capacity] totals across all nodes.
      # BUG FIX: the rescue previously fell through without an explicit
      # return, handing callers the logger's return value on error; it now
      # always returns a [cpu, memory] pair.
      def get_cluster_cpu_memory_capacity(log)
        begin
          node_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body)
          cluster_cpu_capacity = 0.0
          cluster_memory_capacity = 0.0
          if !node_inventory.empty?
            node_inventory['items'].each do |node|
              # NOTE(review): parseNodeLimits is passed the ENTIRE inventory on
              # every iteration, which looks like it double counts when there
              # is more than one node -- confirm parseNodeLimits semantics.
              cpu_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "cpu", "cpuCapacityNanoCores")
              if !cpu_capacity_json.nil?
                cpu_capacity_json.each do |cpu_capacity_node|
                  # BUG FIX: the original checked `.to_s.nil?`, which is always
                  # false; a nil Value then crashed the += below.
                  if !cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'].nil?
                    cluster_cpu_capacity += cpu_capacity_node['DataItems'][0]['Collections'][0]['Value']
                  end
                end
                log.info "Cluster CPU Limit #{cluster_cpu_capacity}"
              else
                log.info "Error getting cpu_capacity"
              end
              memory_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "memory", "memoryCapacityBytes")
              if !memory_capacity_json.nil?
                memory_capacity_json.each do |memory_capacity_node|
                  if !memory_capacity_node['DataItems'][0]['Collections'][0]['Value'].nil?
                    cluster_memory_capacity += memory_capacity_node['DataItems'][0]['Collections'][0]['Value']
                  end
                end
                log.info "Cluster Memory Limit #{cluster_memory_capacity}"
              else
                log.info "Error getting memory_capacity"
              end
            end
          else
            log.info "Unable to get cpu and memory capacity"
            return [0.0, 0.0]
          end
          return [cluster_cpu_capacity, cluster_memory_capacity]
        rescue => e
          log.info e
          return [0.0, 0.0]
        end
      end
    end
  end
end
module HealthModel
  class HealthMonitorUtils
    class << self
      # Stable monitor instance id: "<monitor_id>-<md5 of the joined args>".
      def get_monitor_instance_id(monitor_id, args = [])
        "#{monitor_id}-#{Digest::MD5.hexdigest(args.join("/"))}"
      end

      # Returns [cpu_capacity, memory_capacity], querying the kube API for this
      # host's capacity only when either value is still unset (0.0).
      def ensure_cpu_memory_capacity_set(log, cpu_capacity, memory_capacity, hostname)
        log.info "ensure_cpu_memory_capacity_set cpu_capacity #{cpu_capacity} memory_capacity #{memory_capacity}"
        if cpu_capacity != 0.0 && memory_capacity != 0.0
          log.info "CPU And Memory Capacity are already set"
          return [cpu_capacity, memory_capacity]
        end

        begin
          @@nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body)
        rescue => e
          # BUG FIX: was `rescue Exception`, which also swallowed
          # SignalException/SystemExit; StandardError covers API/parse errors.
          log.info "Error when getting nodeInventory from kube API. Exception: #{e.class} Message: #{e.message} "
          ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace)
        end

        # BUG FIX: @@nodeInventory may never have been assigned if the very
        # first fetch failed; reading it unguarded raised NameError.
        node_inventory = defined?(@@nodeInventory) ? @@nodeInventory : nil
        if !node_inventory.nil?
          cpu_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "cpu", "cpuCapacityNanoCores")
          if !cpu_capacity_json.nil?
            cpu_capacity_json.each do |cpu_info_node|
              if !cpu_info_node['DataItems'][0]['Host'].nil? && cpu_info_node['DataItems'][0]['Host'] == hostname
                if !cpu_info_node['DataItems'][0]['Collections'][0]['Value'].nil?
                  cpu_capacity = cpu_info_node['DataItems'][0]['Collections'][0]['Value']
                end
              end
            end
            log.info "CPU Limit #{cpu_capacity}"
          else
            log.info "Error getting cpu_capacity"
          end
          memory_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "memory", "memoryCapacityBytes")
          if !memory_capacity_json.nil?
            memory_capacity_json.each do |memory_info_node|
              if !memory_info_node['DataItems'][0]['Host'].nil? && memory_info_node['DataItems'][0]['Host'] == hostname
                if !memory_info_node['DataItems'][0]['Collections'][0]['Value'].nil?
                  memory_capacity = memory_info_node['DataItems'][0]['Collections'][0]['Value']
                end
              end
            end
            log.info "memory Limit #{memory_capacity}"
          else
            log.info "Error getting memory_capacity"
          end
        end
        # BUG FIX: the original returned the pair only when the node inventory
        # was available; callers otherwise received nil.
        [cpu_capacity, memory_capacity]
      end

      # "a, B" -> { "a" => true, "b" => true } for quick membership checks.
      def build_metrics_hash(metrics_to_collect)
        metrics_to_collect.split(',').map(&:strip).map { |metric| [metric.downcase, true] }.to_h
      end

      # Loads the health monitor configuration shipped with the agent.
      # Returns {} when the file is missing or unparseable.
      def get_health_monitor_config
        health_monitor_config = {}
        begin
          # File.read cannot leak a handle when JSON.parse raises (the
          # original File.open was never closed on a parse error).
          health_monitor_config = JSON.parse(File.read('/opt/microsoft/omsagent/plugin/healthmonitorconfig.json'))
        rescue => e
          # BUG FIX: the rescue referenced an undefined local `log`, which
          # raised NameError and masked the original failure.
          @log.info "Error when opening health config file #{e}" if @log
        end
        health_monitor_config
      end
    end
  end
end
module HealthModel
  # Reduces a batch of health signals before further processing:
  #   1. dedupes daemonset signals, keeping only the latest per monitor
  #      instance id
  #   2. drops signals for objects no longer present in the cluster inventory
  #      (e.g. a node may have signalled just before being scaled down)
  class HealthSignalReducer
    def initialize; end

    def reduce_signals(health_monitor_records, health_k8s_inventory)
      known_nodes = health_k8s_inventory.get_nodes
      known_workloads = health_k8s_inventory.get_workload_names
      latest_by_instance = {}

      health_monitor_records.each do |record|
        instance_id = record.monitor_instance_id
        monitor_id = record.monitor_id

        if latest_by_instance.key?(instance_id)
          # always keep the latest record for a monitor instance id
          if record.transition_date_time > latest_by_instance[instance_id].transition_date_time
            puts 'Duplicate Daemon Set signal'
            latest_by_instance[instance_id] = record
          end
          next
        end

        if HealthMonitorHelpers.is_node_monitor(monitor_id)
          # only keep daemon set records whose node is still in the inventory
          node_name = record.labels['kubernetes.io/hostname']
          next if node_name.nil? || !known_nodes.include?(node_name)
        elsif HealthMonitorHelpers.is_pods_ready_monitor(monitor_id)
          # only keep pod records whose workload is still in the inventory
          workload_name = record.labels[HealthMonitorLabels::WORKLOAD_NAME]
          namespace = record.labels[HealthMonitorLabels::NAMESPACE]
          next if workload_name.nil? || !known_workloads.include?("#{namespace}~~#{workload_name}")
        end

        latest_by_instance[instance_id] = record
      end

      latest_by_instance.values
    end
  end

  # Builds monitor objects from state-transition records.
  class MonitorFactory
    def initialize; end

    # Wraps a state-transition record in a leaf UnitMonitor.
    def create_unit_monitor(monitor_record)
      UnitMonitor.new(monitor_record.monitor_id,
                      monitor_record.monitor_instance_id,
                      monitor_record.state,
                      monitor_record.transition_date_time,
                      monitor_record.labels,
                      monitor_record.config,
                      monitor_record.details)
    end

    # Builds an AggregateMonitor seeded with its first child's state and
    # transition time.
    def create_aggregate_monitor(monitor_id, monitor_instance_id, labels, aggregation_algorithm, aggregation_algorithm_params, child_monitor)
      AggregateMonitor.new(monitor_id,
                           monitor_instance_id,
                           child_monitor.state,
                           child_monitor.transition_date_time,
                           aggregation_algorithm,
                           aggregation_algorithm_params,
                           labels)
    end
  end
end
b/source/code/plugin/health/monitor_set.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +module HealthModel + class MonitorSet + attr_accessor :monitors + + #constructor + def initialize + @monitors = {} + end + + # checks if the monitor is present in the set + def contains?(monitor_instance_id) + @monitors.key?(monitor_instance_id) + end + + # adds or updates the monitor + def add_or_update(monitor) + @monitors[monitor.monitor_instance_id] = monitor + end + + # gets the monitor given the monitor instance id + def get_monitor(monitor_instance_id) + @monitors[monitor_instance_id] if @monitors.key?(monitor_instance_id) + end + + # deletes a monitor from the set + def delete(monitor_instance_id) + if @monitors.key?(monitor_instance_id) + @monitors.delete(monitor_instance_id) + end + end + + # gets the size of the monitor set + def get_size + @monitors.length + end + + # gets the map of monitor instance id to monitors + def get_map + @monitors + end + end +end diff --git a/source/code/plugin/health/node_monitor_hierarchy_reducer.rb b/source/code/plugin/health/node_monitor_hierarchy_reducer.rb new file mode 100644 index 000000000..aafbd07a8 --- /dev/null +++ b/source/code/plugin/health/node_monitor_hierarchy_reducer.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +module HealthModel + class NodeMonitorHierarchyReducer + def initialize + end + + # Finalizes the Node Hierarchy. This removes node pools and node pool set from the hierarchy if they are not present. + def finalize(monitor_set) + monitors_to_reduce = [MonitorId::ALL_AGENT_NODE_POOLS, MonitorId::ALL_NODES] + # for the above monitors, which are constant per cluster, the monitor_id and monitor_instance_id are the same + monitors_to_reduce.each do |monitor_to_reduce| + monitor = monitor_set.get_monitor(monitor_to_reduce) + if !monitor.nil? 
module HealthModel
  # Resolves a monitor's parent in the health model hierarchy from the model
  # definition, caching both parent ids and parent instance ids. The instance
  # id cache also covers cases where the node no longer exists and the parent
  # cannot be recomputed from kube api results.
  class ParentMonitorProvider

    attr_reader :health_model_definition, :parent_monitor_mapping, :parent_monitor_instance_mapping

    def initialize(definition)
      @health_model_definition = definition
      @parent_monitor_mapping = {}          # monitor instance id --> parent monitor id
      @parent_monitor_instance_mapping = {} # monitor instance id --> parent monitor instance id
    end

    # Gets the parent monitor id for the given monitor. Requires the monitor
    # id and labels when the parent is chosen by conditions.
    def get_parent_monitor_id(monitor)
      monitor_id = monitor.monitor_id
      instance_id = monitor.monitor_instance_id

      # serve from the cache so the parent is not recomputed every time
      return @parent_monitor_mapping[instance_id] if @parent_monitor_mapping.key?(instance_id)

      raise "Invalid Monitor Id #{monitor_id} in get_parent_monitor_id" unless @health_model_definition.key?(monitor_id)

      parent_monitor_id = @health_model_definition[monitor_id]['parent_monitor_id']
      if parent_monitor_id.is_a?(String)
        @parent_monitor_mapping[instance_id] = parent_monitor_id
        return parent_monitor_id
      end

      if parent_monitor_id.nil?
        # no fixed parent: evaluate the definition's conditions against labels
        conditions = @health_model_definition[monitor_id]['conditions']
        if conditions.is_a?(Array)
          labels = monitor.labels
          conditions.each do |condition|
            # NOTE(review): the operator comes from the model definition file
            # and is invoked dynamically via send; confirm the definition is
            # trusted input.
            lhs = "#{labels[condition['key']]}"
            rhs = "#{condition['value']}"
            if lhs.send("#{condition['operator']}".to_sym, rhs)
              @parent_monitor_mapping[instance_id] = condition['parent_id']
              return condition['parent_id']
            end
          end
        end
        raise "Conditions were not met to determine the parent monitor id" if monitor_id != MonitorId::CLUSTER
      end
    end

    # Copies the labels named by the child's definition onto the parent.
    def get_parent_monitor_labels(monitor_id, monitor_labels, parent_monitor_id)
      labels_to_copy = @health_model_definition[monitor_id]['labels']
      return {} if labels_to_copy.nil?

      labels_to_copy.each_with_object({}) do |label, parent_labels|
        parent_labels[label] = monitor_labels[label]
      end
    end

    # The model definition entry for the parent monitor id.
    def get_parent_monitor_config(parent_monitor_id)
      @health_model_definition[parent_monitor_id]
    end

    # Builds (and caches) the parent's instance id by joining the values of
    # the labels that identify a parent instance.
    def get_parent_monitor_instance_id(monitor_instance_id, parent_monitor_id, parent_monitor_labels)
      return @parent_monitor_instance_mapping[monitor_instance_id] if @parent_monitor_instance_mapping.key?(monitor_instance_id)

      labels = AggregateMonitorInstanceIdLabels.get_labels_for(parent_monitor_id)
      raise "Expected #{labels} to be an Array for #{parent_monitor_id}" unless labels.is_a?(Array)

      values = labels.map { |label| parent_monitor_labels[label] }
      instance_id = values.empty? ? parent_monitor_id : "#{parent_monitor_id}-#{values.join('-')}"
      @parent_monitor_instance_mapping[monitor_instance_id] = instance_id
      instance_id
    end
  end

  # Leaf monitor in the health model tree.
  class UnitMonitor

    attr_accessor :monitor_id, :monitor_instance_id, :state, :transition_date_time, :labels, :config, :details, :is_aggregate_monitor

    # constructor
    def initialize(monitor_id, monitor_instance_id, state, transition_date_time, labels, config, details)
      @monitor_id = monitor_id
      @monitor_instance_id = monitor_instance_id
      @transition_date_time = transition_date_time
      @state = state
      @labels = labels
      @config = config
      @details = details
      @is_aggregate_monitor = false
    end

    # unit monitors are leaves: they never have member monitors
    def get_member_monitors
      nil
    end
  end
end
get_member_monitors + return nil + end + + end +end \ No newline at end of file diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index f5f65f01b..1702877a2 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -2,6 +2,7 @@ # frozen_string_literal: true module Fluent + class CAdvisor_Perf_Input < Input Plugin.register_input("cadvisorperf", self) @@ -18,6 +19,8 @@ def initialize config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.api.cadvisorperf" config_param :mdmtag, :string, :default => "mdm.cadvisorperf" + config_param :nodehealthtag, :string, :default => "oms.api.KubeHealth.DaemonSet.Node" + #config_param :containerhealthtag, :string, :default => "oms.api.KubeHealth.DaemonSet.Container" def configure(conf) super @@ -51,11 +54,14 @@ def enumerate() record["DataType"] = "LINUX_PERF_BLOB" record["IPName"] = "LogManagement" eventStream.add(time, record) if record - #router.emit(@tag, time, record) if record - end + #router.emit(@tag, time, record) if record + end router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream + #router.emit_stream(@containerhealthtag, eventStream) if eventStream + router.emit_stream(@nodehealthtag, eventStream) if eventStream + @@istestvar = ENV["ISTEST"] if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index 3a0e04c67..f177b62bf 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -67,7 +67,7 @@ def enumerate(eventList = nil) newEventQueryState.push(eventId) if !eventQueryState.empty? 
&& eventQueryState.include?(eventId) next - end + end record["ObjectKind"] = items["involvedObject"]["kind"] record["Namespace"] = items["involvedObject"]["namespace"] record["Name"] = items["involvedObject"]["name"] @@ -94,12 +94,12 @@ def enumerate(eventList = nil) eventStream.add(emitTime, wrapper) if wrapper end router.emit_stream(@tag, eventStream) if eventStream - end + end writeEventQueryState(newEventQueryState) rescue => errorStr $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + end end def run_periodic diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb new file mode 100644 index 000000000..d9672da3b --- /dev/null +++ b/source/code/plugin/in_kube_health.rb @@ -0,0 +1,307 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require_relative "KubernetesApiClient" +require_relative "oms_common" +require_relative "omslog" +require_relative "ApplicationInsightsUtility" + +module Fluent + + Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } + class KubeHealthInput < Input + Plugin.register_input("kubehealth", self) + + config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json' + + @@clusterCpuCapacity = 0.0 + @@clusterMemoryCapacity = 0.0 + + def initialize + super + require "yaml" + require "json" + + @@cluster_id = KubernetesApiClient.getClusterId + @resources = HealthKubernetesResources.instance + @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) + @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled + end + + include HealthModel + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.api.KubeHealth.ReplicaSet" + + def configure(conf) + super + end + + def start + begin + if @run_interval + @finished = false + 
@condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + + @@hmlog = HealthMonitorUtils.get_log_handle + @@clusterName = KubernetesApiClient.getClusterName + @@clusterRegion = KubernetesApiClient.getClusterRegion + cluster_capacity = HealthMonitorUtils.get_cluster_cpu_memory_capacity(@@hmlog) + @@clusterCpuCapacity = cluster_capacity[0] + @@clusterMemoryCapacity = cluster_capacity[1] + @@hmlog.info "Cluster CPU Capacity: #{@@clusterCpuCapacity} Memory Capacity: #{@@clusterMemoryCapacity}" + if @@cluster_health_model_enabled + ApplicationInsightsUtility.sendCustomEvent("in_kube_health Plugin Start", {}) + end + end + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def enumerate + begin + if !@@cluster_health_model_enabled + @@hmlog.info "Cluster Health Model disabled in in_kube_health" + return + end + + currentTime = Time.now + emitTime = currentTime.to_f + batchTime = currentTime.utc.iso8601 + health_monitor_records = [] + eventStream = MultiEventStream.new + + #HealthMonitorUtils.refresh_kubernetes_api_data(@@hmlog, nil) + # we do this so that if the call fails, we get a response code/header etc. 
+ node_inventory_response = KubernetesApiClient.getKubeResourceInfo("nodes") + node_inventory = JSON.parse(node_inventory_response.body) + pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods") + pod_inventory = JSON.parse(pod_inventory_response.body) + deployment_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("deployments", api_version: "extensions/v1beta1").body) + + @resources.node_inventory = node_inventory + @resources.pod_inventory = pod_inventory + @resources.deployment_inventory = deployment_inventory + + if node_inventory_response.code.to_i != 200 + record = process_kube_api_up_monitor("fail", node_inventory_response) + health_monitor_records.push(record) if record + else + record = process_kube_api_up_monitor("pass", node_inventory_response) + health_monitor_records.push(record) if record + end + + if !pod_inventory.nil? + record = process_cpu_oversubscribed_monitor(pod_inventory) + health_monitor_records.push(record) if record + record = process_memory_oversubscribed_monitor(pod_inventory) + health_monitor_records.push(record) if record + pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(pod_inventory, deployment_inventory) + + system_pods = pods_ready_hash.select{|k,v| v['namespace'] == 'kube-system'} + workload_pods = pods_ready_hash.select{|k,v| v['namespace'] != 'kube-system'} + + system_pods_ready_percentage_records = process_pods_ready_percentage(system_pods, HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) + system_pods_ready_percentage_records.each do |record| + health_monitor_records.push(record) if record + end + + workload_pods_ready_percentage_records = process_pods_ready_percentage(workload_pods, HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID) + workload_pods_ready_percentage_records.each do |record| + health_monitor_records.push(record) if record + end + else + @@hmlog.info "POD INVENTORY IS NIL" + end + + if !node_inventory.nil? 
+ node_condition_records = process_node_condition_monitor(node_inventory) + node_condition_records.each do |record| + health_monitor_records.push(record) if record + end + else + @@hmlog.info "NODE INVENTORY IS NIL" + end + + health_monitor_records.each do |record| + eventStream.add(emitTime, record) + end + router.emit_stream(@tag, eventStream) if eventStream + rescue => errorStr + @@hmlog.warn("error in_kube_health: #{errorStr.to_s}") + @@hmlog.debug "backtrace Input #{errorStr.backtrace}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def process_cpu_oversubscribed_monitor(pod_inventory) + timestamp = Time.now.utc.iso8601 + subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory,"cpu", @@clusterCpuCapacity) + state = subscription > @@clusterCpuCapacity ? "fail" : "pass" + #@@hmlog.debug "CPU Oversubscribed Monitor State : #{state}" + + #CPU + monitor_id = HealthMonitorConstants::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"clusterCpuCapacity" => @@clusterCpuCapacity/1000000.to_f, "clusterCpuRequests" => subscription/1000000.to_f}} + # @@hmlog.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id]) + #hmlog.info "Monitor Instance Id: #{monitor_instance_id}" + health_record = {} + time_now = Time.now.utc.iso8601 + 
process_memory_oversubscribed_monitor(pod_inventory) + timestamp = Time.now.utc.iso8601 + subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory,"memory", @@clusterMemoryCapacity) + state = subscription > @@clusterMemoryCapacity ? "fail" : "pass" + #@@hmlog.debug "Memory Oversubscribed Monitor State : #{state}" + + #CPU + monitor_id = HealthMonitorConstants::WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"clusterMemoryCapacity" => @@clusterMemoryCapacity.to_f, "clusterMemoryRequests" => subscription.to_f}} + hmlog = HealthMonitorUtils.get_log_handle + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id]) + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id + #@@hmlog.info "Successfully processed process_memory_oversubscribed_monitor" + return health_record + end + + def process_kube_api_up_monitor(state, response) + timestamp = Time.now.utc.iso8601 + + monitor_id = HealthMonitorConstants::KUBE_API_STATUS + details = response.each_header.to_h + details['ResponseCode'] = response.code + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => details} + hmlog = HealthMonitorUtils.get_log_handle + #hmlog.info health_monitor_record + + monitor_instance_id = HealthMonitorConstants::KUBE_API_STATUS + #hmlog.info "Monitor Instance Id: #{monitor_instance_id}" + health_record = {} + time_now = Time.now.utc.iso8601 + 
health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id + #@@hmlog.info "Successfully processed process_kube_api_up_monitor" + return health_record + end + + def process_pods_ready_percentage(pods_hash, config_monitor_id) + monitor_config = @provider.get_config(config_monitor_id) + hmlog = HealthMonitorUtils.get_log_handle + + records = [] + pods_hash.keys.each do |key| + workload_name = key + total_pods = pods_hash[workload_name]['totalPods'] + pods_ready = pods_hash[workload_name]['podsReady'] + namespace = pods_hash[workload_name]['namespace'] + workload_kind = pods_hash[workload_name]['kind'] + percent = pods_ready.to_f / total_pods * 100 + timestamp = Time.now.utc.iso8601 + + state = HealthMonitorUtils.compute_percentage_state((100-percent), monitor_config) + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"totalPods" => total_pods, "podsReady" => pods_ready, "workloadName" => workload_name, "namespace" => namespace, "workloadKind" => workload_kind}} + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(config_monitor_id, [@@cluster_id, namespace, workload_name]) + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = config_monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + 
health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id + records.push(health_record) + end + #@@hmlog.info "Successfully processed pods_ready_percentage for #{config_monitor_id} #{records.size}" + return records + end + + def process_node_condition_monitor(node_inventory) + monitor_id = HealthMonitorConstants::NODE_CONDITION_MONITOR_ID + timestamp = Time.now.utc.iso8601 + monitor_config = @provider.get_config(monitor_id) + node_condition_monitor_records = [] + if !node_inventory.nil? + node_inventory['items'].each do |node| + node_name = node['metadata']['name'] + conditions = node['status']['conditions'] + state = HealthMonitorUtils.get_node_state_from_node_conditions(conditions) + #hmlog.debug "Node Name = #{node_name} State = #{state}" + details = {} + conditions.each do |condition| + details[condition['type']] = {"Reason" => condition['reason'], "Message" => condition['message']} + end + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => details} + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id, node_name]) + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id + health_record[HealthMonitorRecordFields::NODE_NAME] = node_name + node_condition_monitor_records.push(health_record) + end + end + #@@hmlog.info "Successfully processed process_node_condition_monitor #{node_condition_monitor_records.size}" + return node_condition_monitor_records + end + + def run_periodic + @mutex.lock + done = @finished + until done + 
@condition.wait(@mutex, @run_interval) + done = @finished + @mutex.unlock + if !done + begin + @@hmlog.info("in_kube_health::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + @@hmlog.warn "in_kube_health::run_periodic: enumerate Failed for kubeapi sourced data health: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock + end + @mutex.unlock + end + end +end diff --git a/test/code/plugin/filter_health_model_builder_test.rb b/test/code/plugin/filter_health_model_builder_test.rb new file mode 100644 index 000000000..f4dba11ed --- /dev/null +++ b/test/code/plugin/filter_health_model_builder_test.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +require 'test/unit' +require 'json' +# require_relative '../../../source/code/plugin/health' + +Dir[File.join(__dir__, '../../../source/code/plugin/health', '*.rb')].each { |file| require file } + +class FilterHealthModelBuilderTest < Test::Unit::TestCase + include HealthModel + + def test_event_stream + health_definition_path = 'C:\AzureMonitor\ContainerInsights\Docker-Provider\installer\conf\health_model_definition.json' + health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + monitor_factory = MonitorFactory.new + hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. 
For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + state_finalizers = [AggregateMonitorStateFinalizer.new] + monitor_set = MonitorSet.new + model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + i = 1 + loop do + mock_data_path = "C:/AzureMonitor/ContainerInsights/Docker-Provider/source/code/plugin/mock_data-#{i}.json" + file = File.read(mock_data_path) + data = JSON.parse(file) + + health_monitor_records = [] + data.each do |record| + health_monitor_record = HealthMonitorRecord.new( + record[HealthMonitorRecordFields::MONITOR_ID], + record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + record[HealthMonitorRecordFields::DETAILS]["state"], + record[HealthMonitorRecordFields::MONITOR_LABELS], + record[HealthMonitorRecordFields::MONITOR_CONFIG], + record[HealthMonitorRecordFields::DETAILS] + ) + health_monitor_records.push(health_monitor_record) + end + + model_builder.process_state_transitions(health_monitor_records) + changed_monitors = model_builder.finalize_model + changed_monitors.keys.each{|key| + puts key + } + i = i + 1 + if i == 6 + break + end + end + puts "Done" + end +end diff --git a/test/code/plugin/health/aggregate_monitor_spec.rb b/test/code/plugin/health/aggregate_monitor_spec.rb new file mode 100644 index 000000000..729965999 --- /dev/null +++ b/test/code/plugin/health/aggregate_monitor_spec.rb @@ -0,0 +1,256 @@ +require_relative '../test_helpers' + +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel + +describe "AggregateMonitor Spec" do + it "is_aggregate_monitor is true for AggregateMonitor" do + # Arrange/Act + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {}) + # Assert + assert_equal monitor.is_aggregate_monitor, true 
+ end + + it "add_member_monitor tests -- adds a member monitor as a child monitor" do + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {}) + #Act + monitor.add_member_monitor("child_monitor_1") + #Assert + assert_equal monitor.get_member_monitors.include?("child_monitor_1"), true + + #Act + monitor.add_member_monitor("child_monitor_1") + #Assert + assert_equal monitor.get_member_monitors.size, 1 + end + + it "remove_member_monitor tests -- removes a member monitor as a child monitor" do + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {}) + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + + #Act + monitor.remove_member_monitor("child_monitor_1") + #Assert + assert_equal monitor.get_member_monitors.size, 1 + + #Act + monitor.remove_member_monitor("unknown_child") + #Assert + assert_equal monitor.get_member_monitors.size, 1 + end + + it "calculate_details tests -- calculates rollup details based on member monitor states" do + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {}) + + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + + #Act + monitor.calculate_details(monitor_set) + #Assert + assert_equal monitor.details["details"], {"pass"=>["child_monitor_1"], "fail"=>["child_monitor_2"]} + + #Arrange + child_monitor_3 = UnitMonitor.new("monitor_3", "child_monitor_3", "pass", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_3) + monitor.add_member_monitor("child_monitor_3") + + 
#Act + monitor.calculate_details(monitor_set) + #Assert + assert_equal monitor.details["details"], {"pass"=>["child_monitor_1", "child_monitor_3"], "fail"=>["child_monitor_2"]} + end + + it "calculate_state tests -- raises when right aggregation_algorithm NOT specified" do + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "", [], {}) + #Assert + assert_raises do + monitor.calculate_state(monitor_set) + end + end + + it "calculate_state tests -- calculate_worst_of_state " do + # Arrange -- pass, fail = fail + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {}) + + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "fail" + + #Arrange -- pass, pass = pass + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "pass", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_2) + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "pass" + + #Arrange -- pass, warn = warn + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "warn", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_2) + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "warn" + + #Arrange -- warn, fail = fail + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "warn", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_1) + 
monitor_set.add_or_update(child_monitor_2) + + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "fail" + + #Arrange -- warn, unknown = unknown + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "warn", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "unknown", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "warn" + + #Arrange -- pass, unknown = unknown + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "unknown", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "unknown" + end + + it "calculate_state tests -- calculate_percentage_state " do + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 90.0}, {}) + + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "fail" + + #Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 50.0}, {}) + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", 
"child_monitor_2", "fail", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "pass" + + #Arrange -- single child monitor + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 33.3}, {}) + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor.add_member_monitor("child_monitor_1") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "pass" + + + #Arrange -- remove none state + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :none, :time, "percentage", {"state_threshold" => 100.0}, {}) + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "none", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "pass" + + + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 50.0}, {}) + + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {}) + child_monitor_3 = UnitMonitor.new("monitor_3", "child_monitor_3", "fail", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + 
monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + monitor_set.add_or_update(child_monitor_3) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + monitor.add_member_monitor("child_monitor_3") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "fail" + + + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 90.0}, {}) + + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "pass", "time", {}, {}, {}) + child_monitor_3 = UnitMonitor.new("monitor_3", "child_monitor_3", "pass", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + monitor_set.add_or_update(child_monitor_3) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + monitor.add_member_monitor("child_monitor_3") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "pass" + end +end \ No newline at end of file diff --git a/test/code/plugin/health/aggregate_monitor_state_finalizer_spec.rb b/test/code/plugin/health/aggregate_monitor_state_finalizer_spec.rb new file mode 100644 index 000000000..f1ae0564d --- /dev/null +++ b/test/code/plugin/health/aggregate_monitor_state_finalizer_spec.rb @@ -0,0 +1,59 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe "AggregateMonitorStateFinalizer spec" do + it 'computes the right state and details' do + #arrange + monitor_set = Mock.new + + #mock unit monitors + child1 = Mock.new + def 
child1.state; "pass"; end + def child1.monitor_id; "child1";end + def child1.monitor_instance_id; "child1"; end + def child1.nil?; false; end + def child1.is_aggregate_monitor; false; end + + child2 = Mock.new + def child2.state; "fail"; end + def child2.monitor_id; "child2";end + def child2.monitor_instance_id; "child2"; end + def child2.nil?; false; end + def child2.is_aggregate_monitor; false; end + + parent_monitor = AggregateMonitor.new("parent_monitor", "parent_monitor", :none, :time, "worstOf", nil, {}) + parent_monitor.add_member_monitor("child1") + parent_monitor.add_member_monitor("child2") + + top_level_monitor = AggregateMonitor.new("cluster", "cluster", :none, :time, "worstOf", nil, {}) + top_level_monitor.add_member_monitor("parent_monitor") + + monitor_set.expect(:get_map, {"cluster" => top_level_monitor, "parent_monitor" => parent_monitor, "child1" => child1, "child2" => child2}) + monitor_set.expect(:get_monitor, top_level_monitor, ["cluster"]) + monitor_set.expect(:get_monitor, parent_monitor, ["parent_monitor"]) + monitor_set.expect(:get_monitor, child1, ["child1"]) + monitor_set.expect(:get_monitor, child2, ["child2"]) + monitor_set.expect(:get_monitor, child1, ["child1"]) + monitor_set.expect(:get_monitor, child2, ["child2"]) + monitor_set.expect(:get_monitor, parent_monitor, ["parent_monitor"]) + + + monitor_set.expect(:get_monitor, parent_monitor, ["parent_monitor"]) + monitor_set.expect(:get_monitor, child1, ["child1"]) + monitor_set.expect(:get_monitor, child2, ["child2"]) + + #act + finalizer = AggregateMonitorStateFinalizer.new + finalizer.finalize(monitor_set) + #assert + + assert_equal parent_monitor.state, "fail" + assert_equal parent_monitor.details, {"details"=>{"pass"=>["child1"], "fail"=>["child2"]}, "state"=>"fail", "timestamp"=>:time} + + assert_equal top_level_monitor.state, "fail" + assert_equal top_level_monitor.details, {"details"=>{"fail"=>["parent_monitor"]}, "state"=>"fail", "timestamp"=>:time} + + end +end \ No newline at 
end of file diff --git a/test/code/plugin/health/ca.crt b/test/code/plugin/health/ca.crt new file mode 100644 index 000000000..9daeafb98 --- /dev/null +++ b/test/code/plugin/health/ca.crt @@ -0,0 +1 @@ +test diff --git a/test/code/plugin/health/cluster_health_state_spec.rb b/test/code/plugin/health/cluster_health_state_spec.rb new file mode 100644 index 000000000..897291fe2 --- /dev/null +++ b/test/code/plugin/health/cluster_health_state_spec.rb @@ -0,0 +1,37 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +require 'time' +include HealthModel +include Minitest + +describe "Cluster Health State Spec" do + + it "ClusterHealthState.new throws if cert file is NOT present" do + state = { + "m1" => { + "state" => "pass", + "time" => Time.now.utc.iso8601 + } + } + + token_file_path = 'token' + cert_file_path = '/var/ca.crt' + + proc {ClusterHealthState.new(token_file_path, cert_file_path)}.must_raise + + end + + it "ClusterHealthState.new returns nil if token is NOT present" do + state = { + "m1" => { + "state" => "pass", + "time" => Time.now.utc.iso8601 + } + } + token_file_path = 'token' + cert_file_path = File.join(File.expand_path(File.dirname(__FILE__)), "ca.crt") + + chs = ClusterHealthState.new(token_file_path, cert_file_path) + chs.token.must_be_nil + end +end diff --git a/test/code/plugin/health/health_hierarchy_builder_spec.rb b/test/code/plugin/health/health_hierarchy_builder_spec.rb new file mode 100644 index 000000000..daafe0312 --- /dev/null +++ b/test/code/plugin/health/health_hierarchy_builder_spec.rb @@ -0,0 +1,11 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe 
"HealthHierarchyBuilder spec" do + it 'builds right hierarchy given a child monitor and a parent monitor provider' do + + end + +end \ No newline at end of file diff --git a/test/code/plugin/health/health_kubernetes_resource_spec.rb b/test/code/plugin/health/health_kubernetes_resource_spec.rb new file mode 100644 index 000000000..c27d969ec --- /dev/null +++ b/test/code/plugin/health/health_kubernetes_resource_spec.rb @@ -0,0 +1,222 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel + +describe "HealthKubernetesResources spec" do + it "returns the right set of nodes and workloads given node and pod inventory" do + + #arrange + nodes_json = '{ + "items": [ + { + "metadata": { + "name": "aks-nodepool1-19574989-0" + } + }, + { + "metadata": { + "name": "aks-nodepool1-19574989-1" + } + } + ] + }' + + pods_json = '{ + "items": [ + { + "metadata": { + "name": "diliprdeploymentnodeapps-c4fdfb446-mzcsr", + "generateName": "diliprdeploymentnodeapps-c4fdfb446-", + "namespace": "default", + "selfLink": "/api/v1/namespaces/default/pods/diliprdeploymentnodeapps-c4fdfb446-mzcsr", + "uid": "ee31a9ce-526e-11e9-a899-6a5520730c61", + "resourceVersion": "4597573", + "creationTimestamp": "2019-03-29T22:06:40Z", + "labels": { + "app": "diliprsnodeapppod", + "diliprPodLabel1": "p1", + "diliprPodLabel2": "p2", + "pod-template-hash": "709896002" + }, + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "kind": "ReplicaSet", + "name": "diliprdeploymentnodeapps-c4fdfb446", + "uid": "ee1e78e0-526e-11e9-a899-6a5520730c61", + "controller": true, + "blockOwnerDeletion": true + } + ] + }, + "apiVersion": "v1", + "kind": "Pod" + }, + { + "metadata": { + "name": "pi-m8ccw", + "generateName": "pi-", + "namespace": "default", + "selfLink": "/api/v1/namespaces/default/pods/pi-m8ccw", + "uid": 
"9fb16aaa-7ccc-11e9-8d23-32c49ee6f300", + "resourceVersion": "7940877", + "creationTimestamp": "2019-05-22T20:03:10Z", + "labels": { + "controller-uid": "9fad836f-7ccc-11e9-8d23-32c49ee6f300", + "job-name": "pi" + }, + "ownerReferences": [ + { + "apiVersion": "batch/v1", + "kind": "Job", + "name": "pi", + "uid": "9fad836f-7ccc-11e9-8d23-32c49ee6f300", + "controller": true, + "blockOwnerDeletion": true + } + ] + }, + "apiVersion": "v1", + "kind": "Pod" + }, + { + "metadata": { + "name": "rss-site", + "namespace": "default", + "selfLink": "/api/v1/namespaces/default/pods/rss-site", + "uid": "68a34ea4-7ce4-11e9-8d23-32c49ee6f300", + "resourceVersion": "7954135", + "creationTimestamp": "2019-05-22T22:53:26Z", + "labels": { + "app": "web" + }, + "annotations": { + "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"v1\",\"kind\":\"Pod\",\"metadata\":{\"annotations\":{},\"labels\":{\"app\":\"web\"},\"name\":\"rss-site\",\"namespace\":\"default\"},\"spec\":{\"containers\":[{\"image\":\"nginx\",\"name\":\"front-end\",\"ports\":[{\"containerPort\":80}]},{\"image\":\"nickchase/rss-php-nginx:v1\",\"name\":\"rss-reader\",\"ports\":[{\"containerPort\":88}]}]}}\n" + } + }, + "apiVersion": "v1", + "kind": "Pod" + }, + { + "metadata": { + "name": "kube-proxy-4hjws", + "generateName": "kube-proxy-", + "namespace": "kube-system", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-proxy-4hjws", + "uid": "8cf7c410-88f4-11e9-b1b0-5eb4a3e9de7d", + "resourceVersion": "9661065", + "creationTimestamp": "2019-06-07T07:19:12Z", + "labels": { + "component": "kube-proxy", + "controller-revision-hash": "1271944371", + "pod-template-generation": "16", + "tier": "node" + }, + "annotations": { + "aks.microsoft.com/release-time": "seconds:1559735217 nanos:797729016 ", + "remediator.aks.microsoft.com/kube-proxy-restart": "7" + }, + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "kind": "DaemonSet", + "name": "kube-proxy", + "uid": 
"45640bf6-44e5-11e9-9920-423525a6b683", + "controller": true, + "blockOwnerDeletion": true + } + ] + }, + "apiVersion": "v1", + "kind": "Pod" + } + ] + }' + deployments_json = '{ + "items": [ + { + "metadata": { + "name": "diliprdeploymentnodeapps", + "namespace": "default", + "selfLink": "/apis/extensions/v1beta1/namespaces/default/deployments/diliprdeploymentnodeapps", + "uid": "ee1b111d-526e-11e9-a899-6a5520730c61", + "resourceVersion": "4597575", + "generation": 1, + "creationTimestamp": "2019-03-29T22:06:40Z", + "labels": { + "diliprdeploymentLabel1": "d1", + "diliprdeploymentLabel2": "d2" + }, + "annotations": { + "deployment.kubernetes.io/revision": "1", + "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"apps/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"labels\":{\"diliprdeploymentLabel1\":\"d1\",\"diliprdeploymentLabel2\":\"d2\"},\"name\":\"diliprdeploymentnodeapps\",\"namespace\":\"default\"},\"spec\":{\"replicas\":1,\"selector\":{\"matchLabels\":{\"app\":\"diliprsnodeapppod\"}},\"template\":{\"metadata\":{\"labels\":{\"app\":\"diliprsnodeapppod\",\"diliprPodLabel1\":\"p1\",\"diliprPodLabel2\":\"p2\"}},\"spec\":{\"containers\":[{\"image\":\"rdilip83/logeverysecond:v2\",\"name\":\"diliprcontainerhelloapp\"}]}}}}\n" + } + }, + "spec": { + "replicas": 1, + "selector": { + "matchLabels": { + "app": "diliprsnodeapppod" + } + }, + "template": { + "metadata": { + "creationTimestamp": null, + "labels": { + "app": "diliprsnodeapppod", + "diliprPodLabel1": "p1", + "diliprPodLabel2": "p2" + } + }, + "spec": { + "containers": [ + { + "name": "diliprcontainerhelloapp", + "image": "rdilip83/logeverysecond:v2", + "resources": {}, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "imagePullPolicy": "IfNotPresent" + } + ], + "restartPolicy": "Always", + "terminationGracePeriodSeconds": 30, + "dnsPolicy": "ClusterFirst", + "securityContext": {}, + "schedulerName": "default-scheduler" + } 
+ }, + "strategy": { + "type": "RollingUpdate", + "rollingUpdate": { + "maxUnavailable": "25%", + "maxSurge": "25%" + } + }, + "revisionHistoryLimit": 2, + "progressDeadlineSeconds": 600 + }, + "apiVersion": "extensions/v1beta1", + "kind": "Deployment" + } + ] + }' + nodes = JSON.parse(nodes_json) + pods = JSON.parse(pods_json) + deployments = JSON.parse(deployments_json) + resources = HealthKubernetesResources.instance + resources.node_inventory = nodes + resources.pod_inventory = pods + resources.deployment_inventory = deployments + #act + parsed_nodes = resources.get_nodes + parsed_workloads = resources.get_workload_names + + #assert + assert_equal parsed_nodes.size, 2 + assert_equal parsed_workloads.size, 3 + + assert_equal parsed_nodes, ['aks-nodepool1-19574989-0', 'aks-nodepool1-19574989-1'] + assert_equal parsed_workloads, ['default~~diliprdeploymentnodeapps', 'default~~rss-site', 'kube-system~~kube-proxy'] + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_missing_signal_generator_spec.rb b/test/code/plugin/health/health_missing_signal_generator_spec.rb new file mode 100644 index 000000000..98d65416d --- /dev/null +++ b/test/code/plugin/health/health_missing_signal_generator_spec.rb @@ -0,0 +1,79 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each {|file| require file} +include HealthModel +include Minitest + +describe "HealthMissingSignalGenerator spec" do + it 'generates missing node signals' do + #arrange + resources = Mock.new + resources.expect(:get_nodes, ["node1"]) + resources.expect(:get_workload_names, ["default~~workload1"]) + + provider = Mock.new + provider.expect(:get_node_labels, {HealthMonitorLabels::HOSTNAME => "node1"}, ["node1"]) + + node1_cpu_record = Mock.new + def node1_cpu_record.monitor_id; "node_cpu_utilization"; end + def node1_cpu_record.monitor_instance_id; 
"node_cpu_utilization"; end + def node1_cpu_record.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + def node1_cpu_record.config; {}; end + def node1_cpu_record.state; "pass"; end + + node1_memory_record = Mock.new + def node1_memory_record.monitor_id; "node_memory_utilization"; end + def node1_memory_record.monitor_instance_id; "node_memory_utilization"; end + def node1_memory_record.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + def node1_memory_record.config; {}; end + def node1_memory_record.state; "pass"; end + + node1_condition_record = Mock.new + def node1_condition_record.monitor_id; "node_condition"; end + def node1_condition_record.monitor_instance_id; "node_condition-0c593682737a955dc8e0947ad12754fe"; end + def node1_condition_record.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + def node1_condition_record.config; {}; end + def node1_condition_record.state; "pass"; end + + + workload1_pods_ready_record = Mock.new + def workload1_pods_ready_record.monitor_id; "user_workload_pods_ready"; end + def workload1_pods_ready_record.monitor_instance_id; "user_workload_pods_ready-workload1"; end + def workload1_pods_ready_record.labels; {HealthMonitorLabels::NAMESPACE => "default", HealthMonitorLabels::WORKLOAD_NAME => "workload1"}; end + def workload1_pods_ready_record.config; {}; end + def workload1_pods_ready_record.state; "pass"; end + + generator = HealthMissingSignalGenerator.new + generator.update_last_received_records([node1_cpu_record, node1_memory_record, node1_condition_record, workload1_pods_ready_record]) + + #act + missing = generator.get_missing_signals('fake_cluster_id', [node1_cpu_record, node1_memory_record], resources, provider) + + #assert + assert_equal missing.size, 2 + + assert_equal missing[0].monitor_id, "node_condition" + assert_equal missing[0].state, "unknown" + assert_equal missing[0].monitor_instance_id, "node_condition-0c593682737a955dc8e0947ad12754fe" + + assert_equal missing[1].monitor_id, 
"user_workload_pods_ready" + assert_equal missing[1].state, "unknown" + assert_equal missing[1].monitor_instance_id, "user_workload_pods_ready-workload1" + + #arrange + resources.expect(:get_nodes, ["node1"]) + resources.expect(:get_workload_names, ["default~~workload1"]) + provider.expect(:get_node_labels, {HealthMonitorLabels::HOSTNAME => "node1"}, ["node1"]) + generator.update_last_received_records([node1_cpu_record, node1_memory_record]) + #act + missing = generator.get_missing_signals('fake_cluster_id', [node1_cpu_record, node1_memory_record], resources, provider) + #assert + assert_equal missing.size, 2 + assert_equal missing[0].monitor_id, "node_condition" + assert_equal missing[0].state, "unknown" + assert_equal missing[0].monitor_instance_id, "node_condition-0c593682737a955dc8e0947ad12754fe" + + assert_equal missing[1].monitor_id, "user_workload_pods_ready" + assert_equal missing[1].state, "none" + assert_equal missing[1].monitor_instance_id, "user_workload_pods_ready-workload1" + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_model_buffer_spec.rb b/test/code/plugin/health/health_model_buffer_spec.rb new file mode 100644 index 000000000..259513c08 --- /dev/null +++ b/test/code/plugin/health/health_model_buffer_spec.rb @@ -0,0 +1,25 @@ +require_relative '../../../../source/code/plugin/health/health_model_buffer' +require_relative '../test_helpers' + +include HealthModel + +describe "HealthModelBuffer Spec" do + it "get_buffer returns the correct buffer data" do + # Arrange + buffer = HealthModelBuffer.new + # Act + buffer.add_to_buffer(['mockRecord']) + # Assert + assert_equal buffer.get_buffer.length, 1 + + #Act + buffer.add_to_buffer(['mockRecord1', 'mockRecord2']) + #Assert + assert_equal buffer.get_buffer.length, 3 + + #Act + buffer.reset_buffer + #Assert + assert_equal buffer.get_buffer.length, 0 + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_model_builder_spec.rb 
b/test/code/plugin/health/health_model_builder_spec.rb new file mode 100644 index 000000000..c49e6c92a --- /dev/null +++ b/test/code/plugin/health/health_model_builder_spec.rb @@ -0,0 +1,37 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe "HealthModelBuilder spec" do + it "Verify hierarchy builder and finalizer public methods are called" do + #arrange + mock_hierarchy_builder = Mock::new + health_record = Mock::new + mock_monitor_set = Mock::new + mock_state_finalizer = Mock::new + mock_hierarchy_builder.expect(:process_record, nil, [health_record, mock_monitor_set]) + mock_state_finalizer.expect(:finalize, {}, [mock_monitor_set]) + def mock_monitor_set.get_map; {}; end + + #act + builder = HealthModelBuilder.new(mock_hierarchy_builder, [mock_state_finalizer], mock_monitor_set) + builder.process_records([health_record]) + builder.finalize_model + #assert + assert mock_hierarchy_builder.verify + assert mock_state_finalizer.verify + end + + it "Verify finalize_model raises if state_finalizers is empty" do + #arrange + mock_hierarchy_builder = Mock.new + mock_monitor_set = Mock.new + builder = HealthModelBuilder.new(mock_hierarchy_builder, [], mock_monitor_set) + #act and assert + assert_raises do + builder.finalize_model + end + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_model_builder_test.rb b/test/code/plugin/health/health_model_builder_test.rb new file mode 100644 index 000000000..df921049c --- /dev/null +++ b/test/code/plugin/health/health_model_builder_test.rb @@ -0,0 +1,337 @@ +require 'test/unit' +require 'json' +# require_relative '../../../source/code/plugin/health' + +Dir[File.join(__dir__, '../../../../source/code/plugin/health', 
'*.rb')].each { |file| require file } + +class FilterHealthModelBuilderTest < Test::Unit::TestCase + include HealthModel + + def test_event_stream + #setup + health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json') + health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + monitor_factory = MonitorFactory.new + hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + state_finalizers = [AggregateMonitorStateFinalizer.new] + monitor_set = MonitorSet.new + model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + nodes_file_map = { + #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", + "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + } + + pods_file_map = { + #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", + "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + "second" => 
"C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + } + + cluster_labels = { + 'container.azm.ms/cluster-region' => 'eastus', + 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + 'container.azm.ms/cluster-name' => 'dilipr-health-test' + } + + cluster_id = 'fake_cluster_id' + + #test + state = HealthMonitorState.new() + generator = HealthMissingSignalGenerator.new + + for scenario in ["first", "second", "third"] + mock_data_path = File.join(__dir__, "../../../../health_records/#{scenario}_daemon_set_signals.json") + file = File.read(mock_data_path) + records = JSON.parse(file) + + node_inventory = JSON.parse(File.read(nodes_file_map[scenario])) + pod_inventory = JSON.parse(File.read(pods_file_map[scenario])) + deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/deployments.json"))) + resources = HealthKubernetesResources.instance + resources.node_inventory = node_inventory + resources.pod_inventory = pod_inventory + resources.deployment_inventory = deployment_inventory + + workload_names = resources.get_workload_names + provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../..//installer/conf/healthmonitorconfig.json")) + + health_monitor_records = [] + records.each do |record| + monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + health_monitor_record = HealthMonitorRecord.new( + 
record[HealthMonitorRecordFields::MONITOR_ID], + record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + record[HealthMonitorRecordFields::DETAILS]["state"], + provider.get_labels(record), + provider.get_config(monitor_id), + record[HealthMonitorRecordFields::DETAILS] + ) + + state.update_state(health_monitor_record, + provider.get_config(health_monitor_record.monitor_id) + ) + + # get the health state based on the monitor's operational state + # update state calls updates the state of the monitor based on configuration and history of the the monitor records + health_monitor_record.state = state.get_state(monitor_instance_id).new_state + health_monitor_records.push(health_monitor_record) + instance_state = state.get_state(monitor_instance_id) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + end + + + #handle kube api down + kube_api_down_handler = HealthKubeApiDownHandler.new + health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) + + # Dedupe daemonset signals + # Remove unit monitor signals for “gone” objects + reducer = HealthSignalReducer.new() + reduced_records = reducer.reduce_signals(health_monitor_records, resources) + + cluster_id = 'fake_cluster_id' + + #get the list of 'none' and 'unknown' signals + missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) + #update state for missing signals + missing_signals.each{|signal| + state.update_state(signal, + provider.get_config(signal.monitor_id) + ) + } + generator.update_last_received_records(reduced_records) + reduced_records.push(*missing_signals) + + # build the health model + all_records = reduced_records + model_builder.process_records(all_records) + all_monitors = model_builder.finalize_model + + # update the state for aggregate monitors (unit monitors are updated above) + 
all_monitors.each{|monitor_instance_id, monitor| + if monitor.is_aggregate_monitor + state.update_state(monitor, + provider.get_config(monitor.monitor_id) + ) + end + + instance_state = state.get_state(monitor_instance_id) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + should_send = instance_state.should_send + + # always send cluster monitor as a heartbeat + if !should_send && monitor_instance_id != MonitorId::CLUSTER + all_monitors.delete(monitor_instance_id) + end + } + + records_to_send = [] + all_monitors.keys.each{|key| + record = provider.get_record(all_monitors[key], state) + #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" + } + + if scenario == "first" + assert_equal 50, all_monitors.size + elsif scenario == "second" + assert_equal 34, all_monitors.size + elsif scenario == "third" + assert_equal 5, all_monitors.size + end + # for each key in monitor.keys, + # get the state from health_monitor_state + # generate the record to send + serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records\health_model_state.json')) + serializer.serialize(state) + + deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records\health_model_state.json')) + deserialized_state = deserializer.deserialize + + after_state = HealthMonitorState.new + after_state.initialize_state(deserialized_state) + end + end + + def test_event_stream_aks_engine + + #setup + health_definition_path = File.join(__dir__, '../../../../installer\conf\health_model_definition.json') + health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + monitor_factory = MonitorFactory.new + hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + state_finalizers = [AggregateMonitorStateFinalizer.new] + monitor_set = MonitorSet.new + model_builder = 
HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + nodes_file_map = { + #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", + #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + } + + pods_file_map = { + #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", + #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + "aks-engine-1" => 
"C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + } + + cluster_labels = { + 'container.azm.ms/cluster-region' => 'eastus', + 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + 'container.azm.ms/cluster-resource-group' => 'aks-engine-health', + 'container.azm.ms/cluster-name' => 'aks-engine-health' + } + + cluster_id = 'fake_cluster_id' + + #test + state = HealthMonitorState.new() + generator = HealthMissingSignalGenerator.new + + for scenario in 1..3 + mock_data_path = File.join(__dir__, "../../../../health_records/aks-engine/aks-engine-#{scenario}.json") + file = File.read(mock_data_path) + records = JSON.parse(file) + + node_inventory = JSON.parse(File.read(nodes_file_map["aks-engine-#{scenario}"])) + pod_inventory = JSON.parse(File.read(pods_file_map["aks-engine-#{scenario}"])) + deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/aks-engine/deployments.json"))) + resources = HealthKubernetesResources.instance + resources.node_inventory = node_inventory + resources.pod_inventory = pod_inventory + resources.deployment_inventory = deployment_inventory + + workload_names = resources.get_workload_names + provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + health_monitor_records = [] + records.each do |record| + monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + health_monitor_record = HealthMonitorRecord.new( + record[HealthMonitorRecordFields::MONITOR_ID], + record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + 
record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + record[HealthMonitorRecordFields::DETAILS]["state"], + provider.get_labels(record), + provider.get_config(monitor_id), + record[HealthMonitorRecordFields::DETAILS] + ) + + state.update_state(health_monitor_record, + provider.get_config(health_monitor_record.monitor_id) + ) + + # get the health state based on the monitor's operational state + # update state calls updates the state of the monitor based on configuration and history of the the monitor records + health_monitor_record.state = state.get_state(monitor_instance_id).new_state + health_monitor_records.push(health_monitor_record) + instance_state = state.get_state(monitor_instance_id) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + end + + + #handle kube api down + kube_api_down_handler = HealthKubeApiDownHandler.new + health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) + + # Dedupe daemonset signals + # Remove unit monitor signals for “gone” objects + reducer = HealthSignalReducer.new() + reduced_records = reducer.reduce_signals(health_monitor_records, resources) + + cluster_id = 'fake_cluster_id' + + #get the list of 'none' and 'unknown' signals + missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) + #update state for missing signals + missing_signals.each{|signal| + state.update_state(signal, + provider.get_config(signal.monitor_id) + ) + } + generator.update_last_received_records(reduced_records) + reduced_records.push(*missing_signals) + + # build the health model + all_records = reduced_records + model_builder.process_records(all_records) + all_monitors = model_builder.finalize_model + + # update the state for aggregate monitors (unit monitors are updated above) + all_monitors.each{|monitor_instance_id, monitor| + if monitor.is_aggregate_monitor + state.update_state(monitor, + 
provider.get_config(monitor.monitor_id) + ) + end + + instance_state = state.get_state(monitor_instance_id) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + should_send = instance_state.should_send + + # always send cluster monitor as a heartbeat + if !should_send && monitor_instance_id != MonitorId::CLUSTER + all_monitors.delete(monitor_instance_id) + end + } + + records_to_send = [] + all_monitors.keys.each{|key| + record = provider.get_record(all_monitors[key], state) + #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" + } + + if scenario == 1 + assert_equal 58, all_monitors.size + elsif scenario == 2 + assert_equal 37, all_monitors.size + elsif scenario == 3 + assert_equal 6, all_monitors.size + end + # for each key in monitor.keys, + # get the state from health_monitor_state + # generate the record to send + serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) + serializer.serialize(state) + + deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) + deserialized_state = deserializer.deserialize + + after_state = HealthMonitorState.new + after_state.initialize_state(deserialized_state) + end + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_model_definition_parser_spec.rb b/test/code/plugin/health/health_model_definition_parser_spec.rb new file mode 100644 index 000000000..56551510b --- /dev/null +++ b/test/code/plugin/health/health_model_definition_parser_spec.rb @@ -0,0 +1,24 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel + 
+describe "HealthModelDefinitionParser spec " do + it "parses the definition file correctly with the right conditions" do + #arrange + + parser = HealthModelDefinitionParser.new(File.join(File.expand_path(File.dirname(__FILE__)), 'test_health_model_definition.json')) + #act + model_definition = parser.parse_file + + #assert + assert_equal model_definition['conditional_monitor_id'].key?("conditions"), true + assert_equal model_definition['conditional_monitor_id']["conditions"].size, 2 + assert_equal model_definition['conditional_monitor_id'].key?("parent_monitor_id"), false + + #assert + assert_equal model_definition['monitor_id'].key?("conditions"), false + assert_equal model_definition['monitor_id'].key?("parent_monitor_id"), true + end + +end \ No newline at end of file diff --git a/test/code/plugin/health/health_monitor_state_spec.rb b/test/code/plugin/health/health_monitor_state_spec.rb new file mode 100644 index 000000000..5fa8a6c6e --- /dev/null +++ b/test/code/plugin/health/health_monitor_state_spec.rb @@ -0,0 +1,176 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe "HealthMonitorState spec" do + it 'updates should_send to true for monitors which hasnt been sent before' do + #arrange + state = HealthMonitorState.new + mock_monitor = Mock.new + def mock_monitor.state; "pass"; end + def mock_monitor.monitor_id; "monitor_id"; end + def mock_monitor.monitor_instance_id; "monitor_instance_id"; end + def mock_monitor.transition_date_time; Time.now.utc.iso8601; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + + #act + state.update_state(mock_monitor, {}) + monitor_state = state.get_state("monitor_instance_id") + 
#assert + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "pass" + end + + it 'updates should_send to true for monitors which need no consistent state change' do + #arrange + state = HealthMonitorState.new + mock_monitor = Mock.new + def mock_monitor.state; "pass"; end + def mock_monitor.monitor_id; "monitor_id"; end + def mock_monitor.monitor_instance_id; "monitor_instance_id"; end + def mock_monitor.transition_date_time; Time.now.utc.iso8601; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + + #act + state.update_state(mock_monitor, {}) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "pass" + + #arrange + def mock_monitor.state; "fail"; end + def mock_monitor.details; {"state" => "fail", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + #act + state.update_state(mock_monitor, {}) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "pass" + monitor_state.new_state.must_equal "fail" + end + + it 'updates should_send to false for monitors which need consistent state change and has no consistent state change' do + #arrange + state = HealthMonitorState.new + mock_monitor = Mock.new + def mock_monitor.state; "pass"; end + def mock_monitor.monitor_id; "monitor_id"; end + def mock_monitor.monitor_instance_id; "monitor_instance_id"; end + def mock_monitor.transition_date_time; Time.now.utc.iso8601; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + + config = JSON.parse('{ + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }') + #act + 
state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + + #arrange + def mock_monitor.state; "fail"; end + def mock_monitor.details; {"state" => "fail", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal false + end + + it 'updates should_send to true for monitors which need consistent state change and has a consistent state change' do + #arrange + state = HealthMonitorState.new + mock_monitor = Mock.new + def mock_monitor.state; "pass"; end + def mock_monitor.monitor_id; "monitor_id"; end + def mock_monitor.monitor_instance_id; "monitor_instance_id"; end + def mock_monitor.transition_date_time; Time.now.utc.iso8601; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + + config = JSON.parse('{ + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }') + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + + #arrange + def mock_monitor.state; "fail"; end + def mock_monitor.details; {"state" => "fail", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal false + + #act + state.update_state(mock_monitor, config) + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "fail" + end + + it 'updates should_send to false for monitors which need consistent 
state change and has NO state change' do + #arrange + state = HealthMonitorState.new + mock_monitor = Mock.new + def mock_monitor.state; "pass"; end + def mock_monitor.monitor_id; "monitor_id"; end + def mock_monitor.monitor_instance_id; "monitor_instance_id"; end + def mock_monitor.transition_date_time; Time.now.utc.iso8601; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + + config = JSON.parse('{ + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }') + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "none" + + + #arrange + def mock_monitor.state; "pass"; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal false + + #act + state.update_state(mock_monitor, config) + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "pass" + + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal false + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "pass" + end + +end \ No newline at end of file diff --git a/test/code/plugin/health/health_signal_reducer_spec.rb b/test/code/plugin/health/health_signal_reducer_spec.rb new file mode 100644 index 000000000..f71a5c509 --- /dev/null +++ b/test/code/plugin/health/health_signal_reducer_spec.rb @@ -0,0 +1,96 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is 
common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe "HealthSignalReducer spec" do + it "returns the right set of records -- no reduction" do + #arrange + record1 = Mock.new + def record1.monitor_id; "node_cpu_utilization"; end + def record1.monitor_instance_id; "node_cpu_utilization-node1"; end + def record1.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + inventory = Mock.new + def inventory.get_nodes; ["node1"]; end + def inventory.get_workload_names; []; end + reducer = HealthSignalReducer.new + #act + reduced = reducer.reduce_signals([record1], inventory) + #Assert + assert_equal reduced.size, 1 + end + + it "returns only the latest record if multiple records are present for the same monitor" do + #arrange + record1 = Mock.new + def record1.monitor_id; "node_cpu_utilization"; end + def record1.monitor_instance_id; "node_cpu_utilization-node1"; end + def record1.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + def record1.transition_date_time; Time.now.utc.iso8601 ; end + + + record2 = Mock.new + def record2.monitor_id; "node_cpu_utilization"; end + def record2.monitor_instance_id; "node_cpu_utilization-node1"; end + def record2.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + def record2.transition_date_time; "#{Time.now.utc.iso8601}" ; end + + inventory = Mock.new + def inventory.get_nodes; ["node1"]; end + def inventory.get_workload_names; []; end + reducer = HealthSignalReducer.new + #act + reduced = reducer.reduce_signals([record1, record2], inventory) + #Assert + assert_equal reduced.size, 1 + end + + it "returns only those records if the node is present in the inventory" do + #arrange + record1 = Mock.new + def record1.monitor_id; "node_cpu_utilization"; end + def record1.monitor_instance_id; "node_cpu_utilization-node1"; end + def 
record1.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + inventory = Mock.new + def inventory.get_nodes; ["node2"]; end + def inventory.get_workload_names; []; end + + #act + reducer = HealthSignalReducer.new + #assert + assert_equal reducer.reduce_signals([record1], inventory).size, 0 + end + + it "returns only those records if the workload name is present in the inventory" do + #arrange + record1 = Mock.new + def record1.monitor_id; "user_workload_pods_ready"; end + def record1.monitor_instance_id; "user_workload_pods_ready-workload1"; end + def record1.labels; {HealthMonitorLabels::NAMESPACE => "default", HealthMonitorLabels::WORKLOAD_NAME => "workload1"}; end + def record1.transition_date_time; Time.now.utc.iso8601 ; end + + inventory = Mock.new + def inventory.get_nodes; ["node2"]; end + def inventory.get_workload_names; ["default~~workload1"]; end + reducer = HealthSignalReducer.new + + #act + reduced = reducer.reduce_signals([record1], inventory) + + #assert + assert_equal reduced.size, 1 + + #arrange + record2 = Mock.new + def record2.monitor_id; "user_workload_pods_ready"; end + def record2.monitor_instance_id; "user_workload_pods_ready-workload2"; end + def record2.labels; {HealthMonitorLabels::NAMESPACE => "default1", HealthMonitorLabels::WORKLOAD_NAME => "workload2"}; end + def record2.transition_date_time; Time.now.utc.iso8601 ; end + #act + reduced = reducer.reduce_signals([record1, record2], inventory) + #assert + assert_equal reduced.size, 1 + end + +end diff --git a/test/code/plugin/health/kube_api_down_handler_spec.rb b/test/code/plugin/health/kube_api_down_handler_spec.rb new file mode 100644 index 000000000..3f3f9b37f --- /dev/null +++ b/test/code/plugin/health/kube_api_down_handler_spec.rb @@ -0,0 +1,26 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include 
HealthModel + +describe "KubeApiDownHandler spec" do + it "updates states for monitors in monitors_to_change" do + #arrange + record1 = HealthMonitorRecord.new("node_condition", "node_condition-node1", Time.now.utc.iso8601, "pass", {}, {}, {}) + record2 = HealthMonitorRecord.new("kube_api_status", "kube_api_status", Time.now.utc.iso8601, "fail", {}, {}, {}) + record3 = HealthMonitorRecord.new("user_workload_pods_ready", "user_workload_pods_ready-workload1", Time.now.utc.iso8601, "pass", {}, {}, {}) + record4 = HealthMonitorRecord.new("system_workload_pods_ready", "system_workload_pods_ready-workload2", Time.now.utc.iso8601, "pass", {}, {}, {}) + record5 = HealthMonitorRecord.new("subscribed_capacity_cpu", "subscribed_capacity_cpu", Time.now.utc.iso8601, "pass", {}, {}, {}) + record6 = HealthMonitorRecord.new("subscribed_capacity_memory", "subscribed_capacity_memory", Time.now.utc.iso8601, "pass", {}, {}, {}) + handler = HealthKubeApiDownHandler.new + + #act + handler.handle_kube_api_down([record1, record2, record3, record4, record5, record6]) + #assert + assert_equal record1.state, HealthMonitorStates::UNKNOWN + assert_equal record3.state, HealthMonitorStates::UNKNOWN + assert_equal record4.state, HealthMonitorStates::UNKNOWN + assert_equal record5.state, HealthMonitorStates::UNKNOWN + assert_equal record6.state, HealthMonitorStates::UNKNOWN + + end +end diff --git a/test/code/plugin/health/monitor_factory_spec.rb b/test/code/plugin/health/monitor_factory_spec.rb new file mode 100644 index 000000000..2135808bd --- /dev/null +++ b/test/code/plugin/health/monitor_factory_spec.rb @@ -0,0 +1,28 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel + +describe "MonitorFactory Spec" do + it "returns UnitMonitor for 
create_unit_monitor" do + #Arrange + factory = MonitorFactory.new() + monitor_record = HealthMonitorRecord.new(:monitor_id, :monitor_instance_id, :time, :pass, {}, {}, {}) + #act + monitor = factory.create_unit_monitor(monitor_record) + # assert + monitor.must_be_kind_of(UnitMonitor) + end + + it "returns AggregateMonitor for create_aggregate_monitor" do + #arrange + factory = MonitorFactory.new() + mock = Minitest::Mock.new + def mock.state; :pass; end + def mock.transition_date_time; :time; end + #act + monitor = factory.create_aggregate_monitor(:monitor_id, :monitor_instance_id, :pass, {}, {}, mock) + #assert + monitor.must_be_kind_of(AggregateMonitor) + end +end \ No newline at end of file diff --git a/test/code/plugin/health/monitor_set_spec.rb b/test/code/plugin/health/monitor_set_spec.rb new file mode 100644 index 000000000..1f4e970be --- /dev/null +++ b/test/code/plugin/health/monitor_set_spec.rb @@ -0,0 +1,58 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel + +describe "MonitorSet Spec" do + it "add_or_update -- adds a monitor" do + #arrange + set = MonitorSet.new + mock_monitor = MiniTest::Mock.new + def mock_monitor.monitor_instance_id; "monitor_instance_id_1"; end + def mock_monitor.state; :pass;end + #act + set.add_or_update(mock_monitor) + #assert + assert_equal set.get_map.size, 1 + assert_equal set.get_map.key?("monitor_instance_id_1"), true + end + + it "add_or_update -- updates a monitor" do + #arrange + set = MonitorSet.new + mock_monitor = MiniTest::Mock.new + def mock_monitor.monitor_instance_id; "monitor_instance_id_1"; end + def mock_monitor.state; :pass;end + #act + set.add_or_update(mock_monitor) + #assert + assert_equal set.get_map["monitor_instance_id_1"].state, :pass + + #act + def 
mock_monitor.state; :fail;end + set.add_or_update(mock_monitor) + #assert + assert_equal set.get_map["monitor_instance_id_1"].state, :fail + end + + it "delete -- delete a monitor" do + #arrange + set = MonitorSet.new + mock_monitor = MiniTest::Mock.new + def mock_monitor.monitor_instance_id; "monitor_instance_id_1"; end + def mock_monitor.state; :pass;end + set.add_or_update(mock_monitor) + + #act + set.delete("monitor_instance_id_1") + #assert + assert_equal set.get_map.size, 0 + end + + it "get_map -- returns a hash" do + #arrange + set = MonitorSet.new + #act and assert + set.get_map.must_be_kind_of(Hash) + end +end diff --git a/test/code/plugin/health/parent_monitor_provider_spec.rb b/test/code/plugin/health/parent_monitor_provider_spec.rb new file mode 100644 index 000000000..a83db50fc --- /dev/null +++ b/test/code/plugin/health/parent_monitor_provider_spec.rb @@ -0,0 +1,144 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe "ParentMonitorProvider spec" do + it 'returns correct parent_monitor_id for a non-condition case' do + #arrange + definition = JSON.parse('{ + "monitor_id" : { + "parent_monitor_id": "parent_monitor_id", + "labels": [ + "label_1", + "label_2" + ] + } + }' + ) + health_model_definition = ParentMonitorProvider.new(definition) + + monitor = Mock.new + def monitor.monitor_id; "monitor_id"; end + def monitor.monitor_instance_id; "monitor_instance_id"; end + + #act + parent_id = health_model_definition.get_parent_monitor_id(monitor) + #assert + assert_equal parent_id, "parent_monitor_id" + end + + it 'returns raises for an incorrect monitor id' do + #arrange + definition = JSON.parse('{ + "monitor_id" : { + "parent_monitor_id": "parent_monitor_id", + "labels": [ + "label_1", + "label_2" + ] + } + }' + ) + health_model_definition 
= ParentMonitorProvider.new(definition) + + monitor = Mock.new + def monitor.monitor_id; "monitor_id_!"; end + def monitor.monitor_instance_id; "monitor_instance_id"; end + + #act and assert + assert_raises do + parent_id = health_model_definition.get_parent_monitor_id(monitor) + end + end + + it 'returns correct parent_monitor_id for a conditional case' do + #arrange + definition = JSON.parse('{"conditional_monitor_id": { + "conditions": [ + { + "key": "kubernetes.io/role", + "operator": "==", + "value": "master", + "parent_id": "master_node_pool" + }, + { + "key": "kubernetes.io/role", + "operator": "==", + "value": "agent", + "parent_id": "agent_node_pool" + } + ], + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "aggregation_algorithm": "worstOf", + "aggregation_algorithm_params": null + } + + }' + ) + health_model_definition = ParentMonitorProvider.new(definition) + + monitor = Mock.new + def monitor.monitor_id; "conditional_monitor_id"; end + def monitor.monitor_instance_id; "conditional_monitor_instance_id"; end + def monitor.labels; {HealthMonitorLabels::ROLE => "master"}; end + + #act + parent_id = health_model_definition.get_parent_monitor_id(monitor) + #assert + assert_equal parent_id, "master_node_pool" + end + + it 'raises if conditions are not met' do + #arrange + definition = JSON.parse('{"conditional_monitor_id": { + "conditions": [ + { + "key": "kubernetes.io/role", + "operator": "==", + "value": "master", + "parent_id": "master_node_pool" + }, + { + "key": "kubernetes.io/role", + "operator": "==", + "value": "agent", + "parent_id": "agent_node_pool" + } + ], + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + 
"container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "aggregation_algorithm": "worstOf", + "aggregation_algorithm_params": null + } + + }' + ) + health_model_definition = ParentMonitorProvider.new(definition) + + monitor = Mock.new + def monitor.monitor_id; "conditional_monitor_id"; end + def monitor.monitor_instance_id; "conditional_monitor_instance_id"; end + def monitor.labels; {HealthMonitorLabels::ROLE => "master1"}; end + + #act and assert + assert_raises do + parent_id = health_model_definition.get_parent_monitor_id(monitor) + end + end +end diff --git a/test/code/plugin/health/test_health_model_definition.json b/test/code/plugin/health/test_health_model_definition.json new file mode 100644 index 000000000..31d219705 --- /dev/null +++ b/test/code/plugin/health/test_health_model_definition.json @@ -0,0 +1,42 @@ +[ + { + "monitor_id": "monitor_id", + "parent_monitor_id": "parent_monitor_id", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/workload-name", + "container.azm.ms/workload-kind", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "conditional_monitor_id", + "aggregation_algorithm": "worstOf", + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "parent_monitor_id": [ + { + "label": "kubernetes.io/role", + "operator": "==", + "value": "master", + "id": "master_node_pool" + }, + { + "label": "kubernetes.io/role", + "operator": "==", + "value": "agent", + "id": "agent_node_pool" + } + ] + } +] \ No newline at end of file diff --git a/test/code/plugin/health/unit_monitor_spec.rb b/test/code/plugin/health/unit_monitor_spec.rb new file mode 100644 index 000000000..4cbf794db --- 
/dev/null +++ b/test/code/plugin/health/unit_monitor_spec.rb @@ -0,0 +1,20 @@ +require_relative '../../../../source/code/plugin/health/unit_monitor' +require_relative '../test_helpers' + +include HealthModel + +describe "UnitMonitor Spec" do + it "is_aggregate_monitor is false for UnitMonitor" do + # Arrange/Act + monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {}) + # Assert + assert_equal monitor.is_aggregate_monitor, false + end + + it "get_member_monitors is nil for UnitMonitor" do + # Arrange/Act + monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {}) + #Assert + assert_nil monitor.get_member_monitors + end +end \ No newline at end of file diff --git a/test/code/plugin/health/unit_monitor_test.rb b/test/code/plugin/health/unit_monitor_test.rb new file mode 100644 index 000000000..e53617c99 --- /dev/null +++ b/test/code/plugin/health/unit_monitor_test.rb @@ -0,0 +1,16 @@ +require_relative '../../../../source/code/plugin/health/unit_monitor' +require_relative '../test_helpers' + +class UnitMonitorTest < Minitest::Test + include HealthModel + + def test_is_aggregate_monitor_false + monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {}) + assert_equal monitor.is_aggregate_monitor, false + end + + def test_get_member_monitors_nil + monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {}) + assert_nil monitor.get_member_monitors + end +end diff --git a/test/code/plugin/test_helpers.rb b/test/code/plugin/test_helpers.rb new file mode 100644 index 000000000..543f00ac9 --- /dev/null +++ b/test/code/plugin/test_helpers.rb @@ -0,0 +1,3 @@ +gem "minitest" +require "minitest/spec" +require 'minitest/autorun' \ No newline at end of file