diff --git a/Rakefile b/Rakefile
new file mode 100644
index 000000000..3733e71a3
--- /dev/null
+++ b/Rakefile
@@ -0,0 +1,9 @@
+require 'rake/testtask'
+
+task default: "test"
+
+Rake::TestTask.new do |task|
+ task.libs << "test"
+ task.pattern = './test/code/plugin/health/*_spec.rb'
+ task.warning = false
+end
\ No newline at end of file
diff --git a/build/Makefile b/build/Makefile
index b5312cfe3..257980160 100644
--- a/build/Makefile
+++ b/build/Makefile
@@ -91,9 +91,9 @@ CXXFLAGS = $(COMPILE_FLAGS)
# Build targets
ifeq ($(ULINUX),1)
-all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) KIT_STATUS kit fluentbitplugin
+all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) KIT_STATUS kit fluentbitplugin rubypluginstests
else
-all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) fluentbitplugin
+all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) fluentbitplugin rubypluginstests
endif
clean :
@@ -143,6 +143,15 @@ fluentbitplugin :
make -C $(GO_SOURCE_DIR) fbplugin
$(COPY) $(GO_SOURCE_DIR)/out_oms.so $(INTERMEDIATE_DIR)
+rubypluginstests :
+ @echo "========================= Installing pre-reqs for running tests"
+ sudo apt-add-repository ppa:brightbox/ruby-ng -y
+ sudo apt-get update
+ sudo apt-get install ruby2.4 rake -y
+ sudo gem install minitest
+ @echo "========================= Running tests..."
+ rake test
+
#--------------------------------------------------------------------------------
# PAL build
#
diff --git a/installer/conf/container.conf b/installer/conf/container.conf
index f41bd6f98..6d810a0e2 100755
--- a/installer/conf/container.conf
+++ b/installer/conf/container.conf
@@ -17,16 +17,22 @@
#cadvisor perf
- type cadvisorperf
- tag oms.api.cadvisorperf
- run_interval 60s
+ type cadvisorperf
+ tag oms.api.cadvisorperf
+ run_interval 60s
log_level debug
+
+ type filter_cadvisor_health_node
+ log_level debug
+
+
+
#custom_metrics_mdm filter plugin
type filter_cadvisor2mdm
- custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope
+ custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope
metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes
log_level info
@@ -61,6 +67,25 @@
max_retry_wait 9m
+
+
+ @type forward
+ send_timeout 60s
+ recover_wait 10s
+ hard_timeout 60s
+ heartbeat_type tcp
+
+
+ host healthmodel-replicaset-service.kube-system
+ port 25227
+
+
+
+ @type file
+ path /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log
+
+
+
type out_mdm
log_level debug
diff --git a/installer/conf/health_model_definition.json b/installer/conf/health_model_definition.json
new file mode 100644
index 000000000..1112fe158
--- /dev/null
+++ b/installer/conf/health_model_definition.json
@@ -0,0 +1,248 @@
+[
+ {
+ "monitor_id": "user_workload_pods_ready",
+ "parent_monitor_id": "user_workload",
+ "labels": [
+ "container.azm.ms/namespace",
+ "container.azm.ms/workload-name",
+ "container.azm.ms/workload-kind",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "user_workload",
+ "parent_monitor_id": "namespace",
+ "labels": [
+ "container.azm.ms/namespace",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "system_workload_pods_ready",
+ "parent_monitor_id": "system_workload",
+ "labels": [
+ "container.azm.ms/namespace",
+ "container.azm.ms/workload-name",
+ "container.azm.ms/workload-kind",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "system_workload",
+ "parent_monitor_id": "k8s_infrastructure",
+ "labels": [
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "kube_api_status",
+ "parent_monitor_id": "k8s_infrastructure",
+ "labels": [
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "namespace",
+ "labels": [
+ "container.azm.ms/namespace",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ],
+ "parent_monitor_id": "all_namespaces"
+ },
+ {
+ "monitor_id": "k8s_infrastructure",
+ "parent_monitor_id": "cluster",
+ "labels": [
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "all_namespaces",
+ "parent_monitor_id": "all_workloads",
+ "labels": [
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "all_workloads",
+ "parent_monitor_id": "cluster",
+ "labels": [
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "node_cpu_utilization",
+ "parent_monitor_id": "node",
+ "labels": [
+ "kubernetes.io/hostname",
+ "agentpool",
+ "kubernetes.io/role",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "node_memory_utilization",
+ "parent_monitor_id": "node",
+ "labels": [
+ "kubernetes.io/hostname",
+ "agentpool",
+ "kubernetes.io/role",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "node_condition",
+ "parent_monitor_id": "node",
+ "labels": [
+ "kubernetes.io/hostname",
+ "agentpool",
+ "kubernetes.io/role",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "node",
+ "aggregation_algorithm": "worstOf",
+ "labels": [
+ "kubernetes.io/hostname",
+ "agentpool",
+ "kubernetes.io/role",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ],
+ "parent_monitor_id": [
+ {
+ "label": "kubernetes.io/role",
+ "operator": "==",
+ "value": "master",
+ "id": "master_node_pool"
+ },
+ {
+ "label": "kubernetes.io/role",
+ "operator": "==",
+ "value": "agent",
+ "id": "agent_node_pool"
+ }
+ ]
+ },
+ {
+ "monitor_id": "master_node_pool",
+ "aggregation_algorithm": "percentage",
+ "aggregation_algorithm_params": {
+ "critical_threshold": 80.0,
+ "warning_threshold": 90.0
+ },
+ "parent_monitor_id": "all_nodes",
+ "labels": [
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "agent_node_pool",
+ "aggregation_algorithm": "percentage",
+ "aggregation_algorithm_params": {
+ "state_threshold": 80.0
+ },
+ "labels": [
+ "agentpool",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ],
+ "parent_monitor_id": "all_nodes"
+ },
+ {
+ "monitor_id": "all_nodes",
+ "aggregation_algorithm": "worstOf",
+ "parent_monitor_id": "cluster",
+ "labels": [
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "cluster",
+ "aggregation_algorithm": "worstOf",
+ "parent_monitor_id": null,
+ "labels": [
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "subscribed_capacity_cpu",
+ "parent_monitor_id": "capacity",
+ "labels": [
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "subscribed_capacity_memory",
+ "parent_monitor_id": "capacity",
+ "labels": [
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "capacity",
+ "parent_monitor_id": "all_workloads",
+ "labels": [
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ }
+]
\ No newline at end of file
diff --git a/installer/conf/healthmonitorconfig.json b/installer/conf/healthmonitorconfig.json
new file mode 100644
index 000000000..28d562652
--- /dev/null
+++ b/installer/conf/healthmonitorconfig.json
@@ -0,0 +1,31 @@
+{
+ "node_cpu_utilization": {
+ "WarnThresholdPercentage": 80.0,
+ "FailThresholdPercentage": 90.0,
+ "ConsecutiveSamplesForStateTransition": 3
+ },
+ "node_memory_utilization": {
+ "WarnThresholdPercentage": 80.0,
+ "FailThresholdPercentage": 90.0,
+ "ConsecutiveSamplesForStateTransition": 3
+ },
+ "container_cpu_utilization": {
+ "WarnThresholdPercentage": 80.0,
+ "FailThresholdPercentage": 90.0,
+ "ConsecutiveSamplesForStateTransition": 3
+ },
+ "container_memory_utilization": {
+ "WarnThresholdPercentage": 80.0,
+ "FailThresholdPercentage": 90.0,
+ "ConsecutiveSamplesForStateTransition": 3
+ },
+ "user_workload_pods_ready": {
+ "WarnThresholdPercentage": 0.0,
+ "FailThresholdPercentage": 10.0,
+ "ConsecutiveSamplesForStateTransition": 2
+ },
+ "system_workload_pods_ready": {
+ "FailThresholdPercentage": 0.0,
+ "ConsecutiveSamplesForStateTransition": 2
+ }
+}
\ No newline at end of file
diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf
index 0dfa3710e..4b4ec09ea 100644
--- a/installer/conf/kube.conf
+++ b/installer/conf/kube.conf
@@ -1,4 +1,9 @@
# Fluentd config file for OMS Docker - cluster components (kubeAPI)
+
+ type forward
+ port 25227
+ bind 0.0.0.0
+
#Kubernetes pod inventory
@@ -13,7 +18,7 @@
type kubeevents
tag oms.containerinsights.KubeEvents
run_interval 60s
- log_level debug
+ log_level debug
#Kubernetes logs
@@ -47,6 +52,14 @@
log_level debug
+#Kubernetes health
+
+ type kubehealth
+ tag oms.api.KubeHealth.ReplicaSet
+ run_interval 60s
+ log_level debug
+
+
#cadvisor perf- Windows nodes
type wincadvisorperf
@@ -69,6 +82,9 @@
log_level info
+
+ type filter_health_model_builder
+
type out_mdm
log_level debug
@@ -118,7 +134,7 @@
type out_oms_api
log_level debug
- buffer_chunk_limit 10m
+ buffer_chunk_limit 10m
buffer_type file
buffer_path %STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer
buffer_queue_limit 10
@@ -127,6 +143,8 @@
retry_wait 30s
+
+
type out_oms
log_level debug
@@ -170,7 +188,7 @@
max_retry_wait 9m
-
+
type out_oms
log_level debug
num_threads 5
@@ -214,4 +232,16 @@
retry_limit 10
retry_wait 30s
max_retry_wait 9m
+
+
+
+ type out_oms_api
+ log_level debug
+ buffer_chunk_limit 10m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_api_KubeHealth*.buffer
+ buffer_queue_limit 10
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 30s
\ No newline at end of file
diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data
index 62a6f6885..3dc1a18cd 100644
--- a/installer/datafiles/base_container.data
+++ b/installer/datafiles/base_container.data
@@ -112,10 +112,45 @@ MAINTAINER: 'Microsoft Corporation'
/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root
/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root
/opt/livenessprobe.sh; installer/scripts/livenessprobe.sh; 755; root; root
-/opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root
-/opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root
+/opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root
+/opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root
/opt/td-agent-bit-conf-customizer.rb; installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root
+
+
+/opt/microsoft/omsagent/plugin/filter_cadvisor_health_node.rb; source/code/plugin/filter_cadvisor_health_node.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/filter_health_model_builder.rb; source/code/plugin/filter_health_model_builder.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/in_kube_health.rb; source/code/plugin/in_kube_health.rb; 644; root; root
+/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json; installer/conf/healthmonitorconfig.json; 644; root; root
+/etc/opt/microsoft/docker-cimprov/health/health_model_definition.json; installer/conf/health_model_definition.json; 644; root; root
+
+
+/opt/microsoft/omsagent/plugin/health/aggregate_monitor.rb; source/code/plugin/health/aggregate_monitor.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/agg_monitor_id_labels.rb; source/code/plugin/health/agg_monitor_id_labels.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/aggregate_monitor_state_finalizer.rb; source/code/plugin/health/aggregate_monitor_state_finalizer.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/cluster_health_state.rb; source/code/plugin/health/cluster_health_state.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_hierarchy_builder.rb; source/code/plugin/health/health_hierarchy_builder.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_kubernetes_resources.rb; source/code/plugin/health/health_kubernetes_resources.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_kube_api_down_handler.rb; source/code/plugin/health/health_kube_api_down_handler.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_missing_signal_generator.rb; source/code/plugin/health/health_missing_signal_generator.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_model_buffer.rb; source/code/plugin/health/health_model_buffer.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_model_builder.rb; source/code/plugin/health/health_model_builder.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_model_constants.rb; source/code/plugin/health/health_model_constants.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/parent_monitor_provider.rb; source/code/plugin/health/parent_monitor_provider.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_model_definition_parser.rb; source/code/plugin/health/health_model_definition_parser.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_monitor_optimizer.rb; source/code/plugin/health/health_monitor_optimizer.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_monitor_provider.rb; source/code/plugin/health/health_monitor_provider.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_monitor_record.rb; source/code/plugin/health/health_monitor_record.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_monitor_state.rb; source/code/plugin/health/health_monitor_state.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_monitor_utils.rb; source/code/plugin/health/health_monitor_utils.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/health_signal_reducer.rb; source/code/plugin/health/health_signal_reducer.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/monitor_factory.rb; source/code/plugin/health/monitor_factory.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/monitor_set.rb; source/code/plugin/health/monitor_set.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/health/unit_monitor.rb; source/code/plugin/health/unit_monitor.rb; 644; root; root
+
%Links
/opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root
@@ -129,6 +164,7 @@ MAINTAINER: 'Microsoft Corporation'
/etc/opt/microsoft; 755; root; root; sysdir
/etc/opt/microsoft/docker-cimprov; 755; root; root
/etc/opt/microsoft/docker-cimprov/conf; 755; root; root
+/etc/opt/microsoft/docker-cimprov/health; 755; root; root
/etc/opt/omi; 755; root; root; sysdir
/etc/opt/omi/conf; 755; root; root; sysdir
@@ -142,6 +178,7 @@ MAINTAINER: 'Microsoft Corporation'
/opt/microsoft/omsagent; 755; root; root; sysdir
/opt/microsoft/omsagent/plugin; 755; root; root; sysdir
+/opt/microsoft/omsagent/plugin/health; 755; root; root; sysdir
/opt/omi; 755; root; root; sysdir
/opt/omi/lib; 755; root; root; sysdir
@@ -205,12 +242,24 @@ touch /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log
chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log
chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log
+touch /var/opt/microsoft/docker-cimprov/log/health_monitors.log
+chmod 666 /var/opt/microsoft/docker-cimprov/log/health_monitors.log
+chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/health_monitors.log
+
+touch /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log
+chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log
+chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log
+
+touch /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log
+chmod 666 /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log
+chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log
+
mv /etc/opt/microsoft/docker-cimprov/container.conf /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf
chown omsagent:omsagent /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf
%Postuninstall_10
# If we're an upgrade, skip all of this cleanup
-if ${{PERFORMING_UPGRADE_NOT}}; then
+if ${{PERFORMING_UPGRADE_NOT}}; then
# Clean up installinfo.txt file (registered as "conf" file to pass rpmcheck)
rm -f /etc/opt/microsoft/docker-cimprov/conf/installinfo.txt*
rm -f /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt
diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb
index c72e64127..067586629 100644
--- a/installer/scripts/tomlparser.rb
+++ b/installer/scripts/tomlparser.rb
@@ -1,8 +1,10 @@
#!/usr/local/bin/ruby
require_relative "tomlrb"
+require 'json'
-@configMapMountPath = "/etc/config/settings/log-data-collection-settings"
+@log_settings_config_map_mount_path = "/etc/config/settings/log-data-collection-settings"
+@agent_settings_config_map_mount_path = "/etc/config/settings/agent-settings"
@configVersion = ""
@configSchemaVersion = ""
# Setting default values which will be used in case they are not set in the configmap or if configmap doesnt exist
@@ -16,16 +18,16 @@
@excludePath = "*.csv2" #some invalid path
# Use parser to parse the configmap toml file to a ruby structure
-def parseConfigMap
+def parseConfigMap(path)
begin
# Check to see if config map is created
- if (File.file?(@configMapMountPath))
- puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values"
- parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true)
- puts "config::Successfully parsed mounted config map"
+ if (File.file?(path))
+ puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values from #{path}"
+ parsedConfig = Tomlrb.load_file(path, symbolize_keys: true)
+ puts "config::Successfully parsed mounted config map from #{path}"
return parsedConfig
else
- puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults"
+ puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for #{path}"
@excludePath = "*_kube-system_*.log"
return nil
end
@@ -117,19 +119,35 @@ def populateSettingValuesFromConfigMap(parsedConfig)
puts "config::error::Exception while reading config settings for cluster level environment variable collection - #{errorStr}, using defaults"
end
end
+
+ begin
+    if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? && !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil?
+ @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled]
+ puts "enable_health_model = #{@enable_health_model}"
+ end
+ rescue => errorStr
+    puts "config::error::Exception while reading config settings for health_model enabled setting - #{errorStr}, using defaults"
+ @enable_health_model = false
+ end
end
@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"]
puts "****************Start Config Processing********************"
if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it
- configMapSettings = parseConfigMap
+ configMapSettings = {}
+
+ #iterate over every *settings file and build a hash of settings
+ Dir["/etc/config/settings/*settings"].each{|file|
+ puts "Parsing File #{file}"
+ settings = parseConfigMap(file)
+    configMapSettings = configMapSettings.merge(settings) unless settings.nil?
+ }
+
if !configMapSettings.nil?
populateSettingValuesFromConfigMap(configMapSettings)
end
else
- if (File.file?(@configMapMountPath))
puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults"
- end
@excludePath = "*_kube-system_*.log"
end
@@ -155,6 +173,8 @@ def populateSettingValuesFromConfigMap(parsedConfig)
file.write("export AZMON_STDERR_EXCLUDED_NAMESPACES=#{@stderrExcludeNamespaces}\n")
file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n")
file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n")
+ #health_model settings
+ file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n")
# Close file after writing all environment variables
file.close
puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' "
diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb
index 61cbaea00..48b25bf14 100644
--- a/source/code/plugin/KubernetesApiClient.rb
+++ b/source/code/plugin/KubernetesApiClient.rb
@@ -30,13 +30,13 @@ def initialize
end
class << self
- def getKubeResourceInfo(resource)
+ def getKubeResourceInfo(resource, api_version: nil)
headers = {}
response = nil
- @Log.info "Getting Kube resource"
+ @Log.info "Getting Kube resource api_version #{api_version}"
@Log.info resource
begin
- resourceUri = getResourceUri(resource)
+ resourceUri = getResourceUri(resource, api_version: api_version)
if !resourceUri.nil?
uri = URI.parse(resourceUri)
http = Net::HTTP.new(uri.host, uri.port)
@@ -76,10 +76,23 @@ def getTokenStr
end
end
- def getResourceUri(resource)
+ def getClusterRegion
+ if ENV["AKS_REGION"]
+ return ENV["AKS_REGION"]
+ else
+ @Log.warn ("Kubernetes environment variable not set AKS_REGION. Unable to get cluster region.")
+ return nil
+ end
+ end
+
+ def getResourceUri(resource, api_version: nil)
begin
if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"]
- return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + resource
+ if !api_version.nil?
+ return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/apis/" + api_version + "/" + resource
+ end
+ api_version = @@ApiVersion
+ return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + api_version + "/" + resource
else
@Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri")
return nil
@@ -125,6 +138,8 @@ def getClusterId
return @@ClusterId if !@@ClusterId.nil?
#By default initialize ClusterId to ClusterName.
# In ACS/On-prem, we need to figure out how we can generate ClusterId
+ # Dilipr: Spoof the subid by generating md5 hash of cluster name, and taking some constant parts of it.
+ # e.g. md5 digest is 128 bits = 32 character in hex. Get first 16 and get a guid, and the next 16 to get resource id
@@ClusterId = getClusterName
begin
cluster = ENV["AKS_RESOURCE_ID"]
diff --git a/source/code/plugin/filter_cadvisor_health_container.rb b/source/code/plugin/filter_cadvisor_health_container.rb
new file mode 100644
index 000000000..4090092a9
--- /dev/null
+++ b/source/code/plugin/filter_cadvisor_health_container.rb
@@ -0,0 +1,263 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+module Fluent
+ require 'logger'
+ require 'json'
+ require_relative 'oms_common'
+ require_relative 'HealthMonitorUtils'
+ require_relative 'HealthMonitorState'
+ require_relative "ApplicationInsightsUtility"
+
+
+ class CAdvisor2ContainerHealthFilter < Filter
+ Fluent::Plugin.register_filter('filter_cadvisor_health_container', self)
+
+ config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/health_monitors.log'
+ config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryRssBytes'
+ config_param :container_resource_refresh_interval_minutes, :integer, :default => 5
+
+ @@object_name_k8s_node = 'K8SNode'
+ @@object_name_k8s_container = 'K8SContainer'
+
+ @@counter_name_cpu = 'cpuusagenanocores'
+ @@counter_name_memory_rss = 'memoryrssbytes'
+
+ @@health_monitor_config = {}
+
+ @@hostName = (OMS::Common.get_hostname)
+ @@clusterName = KubernetesApiClient.getClusterName
+ @@clusterId = KubernetesApiClient.getClusterId
+ @@clusterRegion = KubernetesApiClient.getClusterRegion
+ @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled
+
+ def initialize
+ super
+ @cpu_capacity = 0.0
+ @memory_capacity = 0.0
+ @last_resource_refresh = DateTime.now.to_time.to_i
+ @metrics_to_collect_hash = {}
+ end
+
+ def configure(conf)
+ super
+ @log = HealthMonitorUtils.getLogHandle
+ @log.debug {'Starting filter_cadvisor2health plugin'}
+ end
+
+ def start
+ super
+ @metrics_to_collect_hash = HealthMonitorUtils.build_metrics_hash(@metrics_to_collect)
+ @log.debug "Calling ensure_cpu_memory_capacity_set cpu_capacity #{@cpu_capacity} memory_capacity #{@memory_capacity}"
+            node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@log, @cpu_capacity, @memory_capacity, @@hostName)
+ @cpu_capacity = node_capacity[0]
+ @memory_capacity = node_capacity[1]
+ @log.info "CPU Capacity #{@cpu_capacity} Memory Capacity #{@memory_capacity}"
+ #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @@hostName)
+ @@health_monitor_config = HealthMonitorUtils.getHealthMonitorConfig
+ ApplicationInsightsUtility.sendCustomEvent("filter_cadvisor_health Plugin Start", {})
+ end
+
+ def filter_stream(tag, es)
+ if !@@cluster_health_model_enabled
+ @log.info "Cluster Health Model disabled in filter_cadvisor_health_container"
+ return []
+ end
+ new_es = MultiEventStream.new
+ #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @hostName)
+ records_count = 0
+ es.each { |time, record|
+ begin
+ filtered_record = filter(tag, time, record)
+ if !filtered_record.nil?
+ new_es.add(time, filtered_record)
+ records_count += 1
+ end
+ rescue => e
+ router.emit_error_event(tag, time, record, e)
+ end
+ }
+ @log.debug "Filter Records Count #{records_count}"
+ new_es
+ end
+
+ def filter(tag, time, record)
+ begin
+ if record.key?("MonitorLabels")
+ return record
+ end
+ object_name = record['DataItems'][0]['ObjectName']
+ counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase
+ if @metrics_to_collect_hash.key?(counter_name.downcase)
+ metric_value = record['DataItems'][0]['Collections'][0]['Value']
+ case object_name
+ when @@object_name_k8s_container
+ case counter_name.downcase
+ when @@counter_name_cpu
+ # @log.debug "Object Name #{object_name}"
+ # @log.debug "Counter Name #{counter_name}"
+ # @log.debug "Metric Value #{metric_value}"
+ #return process_container_cpu_record(record, metric_value)
+ when @@counter_name_memory_rss
+ #return process_container_memory_record(record, metric_value)
+ end
+ when @@object_name_k8s_node
+ case counter_name.downcase
+ when @@counter_name_cpu
+ #process_node_cpu_record(record, metric_value)
+ when @@counter_name_memory_rss
+ #process_node_memory_record(record, metric_value)
+ end
+ end
+ end
+ rescue => e
+ @log.debug "Error in filter #{e}"
+ @log.debug "record #{record}"
+ @log.debug "backtrace #{e.backtrace}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(e)
+ return nil
+ end
+ end
+
+ def process_container_cpu_record(record, metric_value)
+ monitor_id = HealthMonitorConstants::WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID
+ @log.debug "processing container cpu record"
+ if record.nil?
+ return nil
+ else
+ instance_name = record['DataItems'][0]['InstanceName']
+ key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name)
+ container_metadata = HealthMonitorUtils.getContainerMetadata(key)
+ if !container_metadata.nil?
+ cpu_limit = container_metadata['cpuLimit']
+ end
+
+ if cpu_limit.to_s.empty?
+ #@log.info "CPU Limit is nil"
+ cpu_limit = @cpu_capacity
+ end
+
+ #@log.info "cpu limit #{cpu_limit}"
+
+ percent = (metric_value.to_f/cpu_limit*100).round(2)
+ #@log.debug "Container #{key} | Percentage of CPU limit: #{percent}"
+ state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID])
+ #@log.debug "Computed State : #{state}"
+ timestamp = record['DataItems'][0]['Timestamp']
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}}
+ #@log.info health_monitor_record
+
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName, key])
+ #@log.info "Monitor Instance Id: #{monitor_instance_id}"
+ temp = record.nil? ? "Nil" : record["MonitorInstanceId"]
+ @log.info "Processed Container CPU #{temp}"
+ return record
+ end
+ return nil
+ end
+
+ def process_container_memory_record(record, metric_value) # computes a container's memory-utilization % against its limit (or node capacity when no limit); NOTE(review): looks WIP — health_monitor_record/monitor_instance_id are built but unused and the raw record is returned — confirm intent
+ monitor_id = HealthMonitorConstants::WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID
+ #@log.debug "processing container memory record"
+ if record.nil?
+ return nil
+ else
+ instance_name = record['DataItems'][0]['InstanceName']
+ key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name)
+ container_metadata = HealthMonitorUtils.getContainerMetadata(key)
+ if !container_metadata.nil?
+ memory_limit = container_metadata['memoryLimit']
+ end
+
+ if memory_limit.to_s.empty?
+ #@log.info "Memory Limit is nil"
+ memory_limit = @memory_capacity # fall back to node memory capacity when the container has no limit set
+ end
+
+ #@log.info "memory limit #{memory_limit}"
+
+ percent = (metric_value.to_f/memory_limit*100).round(2)
+ #@log.debug "Container #{key} | Percentage of Memory limit: #{percent}"
+ state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID])
+ #@log.debug "Computed State : #{state}"
+ timestamp = record['DataItems'][0]['Timestamp']
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} # NOTE(review): computed but never attached to the returned record
+ #@log.info health_monitor_record
+
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName, key]) # NOTE(review): computed but unused
+ #@log.info "Monitor Instance Id: #{monitor_instance_id}"
+ temp = record.nil? ? "Nil" : record["MonitorInstanceId"] # record is known non-nil in this branch; the ternary is redundant
+ @log.info "Processed Container Memory #{temp}"
+ return record
+ end
+ return nil
+ end
+
+ def process_node_cpu_record(record, metric_value) # builds a node-CPU health record (monitor id, instance id, details, timestamps) from a perf record; returns nil for nil input
+ monitor_id = HealthMonitorConstants::NODE_CPU_MONITOR_ID
+ #@log.debug "processing node cpu record"
+ if record.nil?
+ return nil
+ else
+ instance_name = record['DataItems'][0]['InstanceName']
+ #@log.info "CPU capacity #{@cpu_capacity}"
+
+ percent = (metric_value.to_f/@cpu_capacity*100).round(2) # NOTE(review): @cpu_capacity starts at 0.0; float division by zero yields Infinity/NaN, not an exception — confirm capacity is set before records arrive
+ #@log.debug "Percentage of CPU limit: #{percent}"
+ state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::NODE_CPU_MONITOR_ID])
+ #@log.debug "Computed State : #{state}"
+ timestamp = record['DataItems'][0]['Timestamp']
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} # nanocores -> millicores; `.to_f` binds to 1000000, making this float division
+
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName])
+ # record = HealthMonitorSignalReducer.reduceSignal(@log, monitor_id, monitor_instance_id, @@health_monitor_config[monitor_id], node_name: @@hostName)
+ # temp = record.nil? ? "Nil" : record["MonitorInstanceId"]
+ health_record = {}
+ time_now = Time.now.utc.iso8601
+ health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id
+ health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
+ health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record
+ health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now
+ health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now
+ health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName
+ @log.info "Processed Node CPU"
+ return health_record
+ end
+ return nil
+ end
+
+ def process_node_memory_record(record, metric_value)
+ monitor_id = HealthMonitorConstants::NODE_MEMORY_MONITOR_ID
+ #@log.debug "processing node memory record"
+ if record.nil?
+ return nil
+ else
+ instance_name = record['DataItems'][0]['InstanceName']
+ #@log.info "Memory capacity #{@memory_capacity}"
+
+ percent = (metric_value.to_f/@memory_capacity*100).round(2)
+ #@log.debug "Percentage of Memory limit: #{percent}"
+ state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::NODE_MEMORY_MONITOR_ID])
+ #@log.debug "Computed State : #{state}"
+ timestamp = record['DataItems'][0]['Timestamp']
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}}
+ #@log.info health_monitor_record
+
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName])
+ #@log.info "Monitor Instance Id: #{monitor_instance_id}"
+ # temp = record.nil? ? "Nil" : record["MonitorInstanceId"]
+ health_record = {}
+ time_now = Time.now.utc.iso8601
+ health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id
+ health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
+ health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record
+ health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now
+ health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now
+ health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName
+ @log.info "Processed Node Memory"
+ return health_record
+ end
+ return nil
+ end
+ end
+end
diff --git a/source/code/plugin/filter_cadvisor_health_node.rb b/source/code/plugin/filter_cadvisor_health_node.rb
new file mode 100644
index 000000000..627a525e7
--- /dev/null
+++ b/source/code/plugin/filter_cadvisor_health_node.rb
@@ -0,0 +1,267 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+module Fluent
+ require 'logger'
+ require 'json'
+ require_relative 'oms_common'
+ require_relative "ApplicationInsightsUtility"
+ require_relative "KubernetesApiClient"
+ Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file }
+
+ class CAdvisor2NodeHealthFilter < Filter
+ include HealthModel
+ Fluent::Plugin.register_filter('filter_cadvisor_health_node', self)
+
+ attr_accessor :provider, :resources
+
+ config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryRssBytes'
+ config_param :container_resource_refresh_interval_minutes, :integer, :default => 5
+ config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json'
+
+ @@object_name_k8s_node = 'K8SNode'
+ @@object_name_k8s_container = 'K8SContainer'
+
+ @@counter_name_cpu = 'cpuusagenanocores'
+ @@counter_name_memory_rss = 'memoryrssbytes'
+
+ @@hm_log = HealthMonitorUtils.get_log_handle
+ @@hostName = (OMS::Common.get_hostname)
+ @@clusterName = KubernetesApiClient.getClusterName
+ @@clusterId = KubernetesApiClient.getClusterId
+ @@clusterRegion = KubernetesApiClient.getClusterRegion
+ @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled
+
+ def initialize # NOTE(review): fluentd config_param values are bound during configure(conf), not initialize — @health_monitor_config_path is presumably still nil here; verify HealthMonitorProvider tolerates that, or move provider construction to #configure
+ begin
+ super
+ @cpu_capacity = 0.0
+ @memory_capacity = 0.0
+ @last_resource_refresh = DateTime.now.to_time.to_i
+ @metrics_to_collect_hash = {}
+ @resources = HealthKubernetesResources.instance # this doesnt require node and pod inventory. So no need to populate them
+ @provider = HealthMonitorProvider.new(@@clusterId, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) # NOTE(review): @health_monitor_config_path likely nil at this point — TODO confirm
+ rescue => e
+ ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"})
+ end
+ end
+
+ def configure(conf)
+ super
+ @log = HealthMonitorUtils.get_log_handle
+ @log.debug {'Starting filter_cadvisor2health plugin'}
+ end
+
+ def start
+ super
+ @metrics_to_collect_hash = HealthMonitorUtils.build_metrics_hash(@metrics_to_collect)
+ @log.debug "Calling ensure_cpu_memory_capacity_set cpu_capacity #{@cpu_capacity} memory_capacity #{@memory_capacity}"
+ node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, @memory_capacity, @@hostName)
+ @cpu_capacity = node_capacity[0]
+ @memory_capacity = node_capacity[1]
+ @log.info "CPU Capacity #{@cpu_capacity} Memory Capacity #{@memory_capacity}"
+ #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @@hostName)
+ ApplicationInsightsUtility.sendCustomEvent("filter_cadvisor_health Plugin Start", {})
+ end
+
+ def filter_stream(tag, es)
+ if !@@cluster_health_model_enabled
+ @log.info "Cluster Health Model disabled in filter_cadvisor_health_node"
+ return []
+ end
+ new_es = MultiEventStream.new
+ #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @hostName)
+ records_count = 0
+ es.each { |time, record|
+ begin
+ filtered_record = filter(tag, time, record)
+ if !filtered_record.nil?
+ new_es.add(time, filtered_record)
+ records_count += 1
+ end
+ rescue => e
+ @log.info "Error in filter_stream for filter_cadvisor_health_node #{e.message}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"})
+ end
+ }
+ @log.debug "Filter Records Count #{records_count}"
+ new_es
+ end
+
+ def filter(tag, time, record) # routes a perf record to the node cpu/memory processors; returns nil (drops the record) when unhandled or on error
+ begin
+ if record.key?("MonitorLabels")
+ return record
+ end
+ object_name = record['DataItems'][0]['ObjectName']
+ counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase
+ if @metrics_to_collect_hash.key?(counter_name) # FIX: counter_name is already downcased at assignment above — removed the redundant .downcase calls here and in the case expressions below
+ metric_value = record['DataItems'][0]['Collections'][0]['Value']
+ case object_name
+ when @@object_name_k8s_container
+ case counter_name
+ when @@counter_name_cpu
+ # @log.debug "Object Name #{object_name}"
+ # @log.debug "Counter Name #{counter_name}"
+ # @log.debug "Metric Value #{metric_value}"
+ #return process_container_cpu_record(record, metric_value)
+ when @@counter_name_memory_rss
+ #return process_container_memory_record(record, metric_value)
+ end
+ when @@object_name_k8s_node
+ case counter_name
+ when @@counter_name_cpu
+ process_node_cpu_record(record, metric_value)
+ when @@counter_name_memory_rss
+ process_node_memory_record(record, metric_value)
+ end
+ end
+ end
+ rescue => e
+ @log.debug "Error in filter #{e}"
+ @log.debug "record #{record}"
+ @log.debug "backtrace #{e.backtrace}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(e)
+ return nil
+ end
+ end
+
+ def process_container_cpu_record(record, metric_value)
+ monitor_id = HealthMonitorConstants::CONTAINER_CPU_MONITOR_ID
+ @log.debug "processing container cpu record"
+ if record.nil?
+ return nil
+ else
+ instance_name = record['DataItems'][0]['InstanceName']
+ key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name)
+ container_metadata = HealthMonitorUtils.getContainerMetadata(key)
+ if !container_metadata.nil?
+ cpu_limit = container_metadata['cpuLimit']
+ end
+
+ if cpu_limit.to_s.empty?
+ #@log.info "CPU Limit is nil"
+ cpu_limit = @cpu_capacity
+ end
+
+ #@log.info "cpu limit #{cpu_limit}"
+
+ percent = (metric_value.to_f/cpu_limit*100).round(2)
+ #@log.debug "Container #{key} | Percentage of CPU limit: #{percent}"
+ state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(monitor_id))
+ #@log.debug "Computed State : #{state}"
+ timestamp = record['DataItems'][0]['Timestamp']
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}}
+ #@log.info health_monitor_record
+
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName, key])
+ #@log.info "Monitor Instance Id: #{monitor_instance_id}"
+ temp = record.nil? ? "Nil" : record["MonitorInstanceId"]
+ @log.info "Processed Container CPU #{temp}"
+ return record
+ end
+ return nil
+ end
+
+ def process_container_memory_record(record, metric_value)
+ monitor_id = HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID
+ #@log.debug "processing container memory record"
+ if record.nil?
+ return nil
+ else
+ instance_name = record['DataItems'][0]['InstanceName']
+ key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name)
+ container_metadata = HealthMonitorUtils.getContainerMetadata(key)
+ if !container_metadata.nil?
+ memory_limit = container_metadata['memoryLimit']
+ end
+
+ if memory_limit.to_s.empty?
+ #@log.info "Memory Limit is nil"
+ memory_limit = @memory_capacity
+ end
+
+ #@log.info "memory limit #{memory_limit}"
+
+ percent = (metric_value.to_f/memory_limit*100).round(2)
+ #@log.debug "Container #{key} | Percentage of Memory limit: #{percent}"
+ state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID))
+ #@log.debug "Computed State : #{state}"
+ timestamp = record['DataItems'][0]['Timestamp']
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}}
+ #@log.info health_monitor_record
+
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName, key])
+ #@log.info "Monitor Instance Id: #{monitor_instance_id}"
+ temp = record.nil? ? "Nil" : record["MonitorInstanceId"]
+ @log.info "Processed Container Memory #{temp}"
+ return record
+ end
+ return nil
+ end
+
+ def process_node_cpu_record(record, metric_value)
+ monitor_id = HealthMonitorConstants::NODE_CPU_MONITOR_ID
+ #@log.debug "processing node cpu record"
+ if record.nil?
+ return nil
+ else
+ instance_name = record['DataItems'][0]['InstanceName']
+ #@log.info "CPU capacity #{@cpu_capacity}"
+
+ percent = (metric_value.to_f/@cpu_capacity*100).round(2)
+ #@log.debug "Percentage of CPU limit: #{percent}"
+ state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::NODE_CPU_MONITOR_ID))
+ #@log.debug "Computed State : #{state}"
+ timestamp = record['DataItems'][0]['Timestamp']
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}}
+
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName])
+ # temp = record.nil? ? "Nil" : record["MonitorInstanceId"]
+ health_record = {}
+ time_now = Time.now.utc.iso8601
+ health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id
+ health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
+ health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record
+ health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now
+ health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now
+ health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName
+ @log.info "Processed Node CPU"
+ return health_record
+ end
+ return nil
+ end
+
+ def process_node_memory_record(record, metric_value)
+ monitor_id = HealthMonitorConstants::NODE_MEMORY_MONITOR_ID
+ #@log.debug "processing node memory record"
+ if record.nil?
+ return nil
+ else
+ instance_name = record['DataItems'][0]['InstanceName']
+ #@log.info "Memory capacity #{@memory_capacity}"
+
+ percent = (metric_value.to_f/@memory_capacity*100).round(2)
+ #@log.debug "Percentage of Memory limit: #{percent}"
+ state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::NODE_MEMORY_MONITOR_ID))
+ #@log.debug "Computed State : #{state}"
+ timestamp = record['DataItems'][0]['Timestamp']
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}}
+ #@log.info health_monitor_record
+
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName])
+ health_record = {}
+ time_now = Time.now.utc.iso8601
+ health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id
+ health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
+ health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record
+ health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now
+ health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now
+ health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName
+ @log.info "Processed Node Memory"
+ return health_record
+ end
+ return nil
+ end
+ end
+end
diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb
new file mode 100644
index 000000000..0c1b378a0
--- /dev/null
+++ b/source/code/plugin/filter_health_model_builder.rb
@@ -0,0 +1,233 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+
+# frozen_string_literal: true
+
+module Fluent
+ require 'logger'
+ require 'json'
+ Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file }
+
+
+ class FilterHealthModelBuilder < Filter
+ Fluent::Plugin.register_filter('filter_health_model_builder', self)
+
+ config_param :enable_log, :integer, :default => 0
+ config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log'
+ config_param :model_definition_path, :default => '/etc/opt/microsoft/docker-cimprov/health/health_model_definition.json'
+ config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json'
+ config_param :health_state_serialized_path, :default => '/mnt/azure/health_model_state.json'
+ attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator
+ include HealthModel
+
+ @@rewrite_tag = 'oms.api.KubeHealth.AgentCollectionTime'
+ @@cluster_id = KubernetesApiClient.getClusterId
+ @@token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+ @@cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
+ @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled
+
+ def initialize # NOTE(review): config_param values (@model_definition_path, @health_monitor_config_path) are only bound in configure(conf); they are presumably nil throughout this method — verify, or defer this setup to #configure
+ begin
+ super
+ @buffer = HealthModel::HealthModelBuffer.new
+ @cluster_health_state = ClusterHealthState.new(@@token_file_path, @@cert_file_path)
+ @health_model_definition = HealthModel::ParentMonitorProvider.new(HealthModel::HealthModelDefinitionParser.new(@model_definition_path).parse_file) # NOTE(review): @model_definition_path likely nil here — TODO confirm
+ @monitor_factory = HealthModel::MonitorFactory.new
+ @hierarchy_builder = HealthHierarchyBuilder.new(@health_model_definition, @monitor_factory)
+ # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side
+ @state_finalizers = [HealthModel::AggregateMonitorStateFinalizer.new]
+ @monitor_set = HealthModel::MonitorSet.new
+ @model_builder = HealthModel::HealthModelBuilder.new(@hierarchy_builder, @state_finalizers, @monitor_set)
+ @kube_api_down_handler = HealthKubeApiDownHandler.new
+ @resources = HealthKubernetesResources.instance
+ @reducer = HealthSignalReducer.new
+ @state = HealthMonitorState.new # NOTE(review): dead assignment — @state is re-created a few lines below before initialize_state; one of the two can be removed
+ @generator = HealthMissingSignalGenerator.new
+ #TODO: cluster_labels needs to be initialized
+ @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path)
+ deserialized_state_info = @cluster_health_state.get_state
+ @state = HealthMonitorState.new
+ @state.initialize_state(deserialized_state_info)
+ @cluster_old_state = 'none'
+ @cluster_new_state = 'none'
+ rescue => e
+ ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"})
+ end
+ end
+
+ def configure(conf)
+ begin
+ super
+ @log = nil
+ if @enable_log # NOTE(review): @enable_log is an Integer and 0 is truthy in Ruby, so this guard is ALWAYS true; `if @enable_log != 0` is what was intended — but beware: other methods call @log unconditionally and would NoMethodError on nil if logging were actually disabled
+ @log = Logger.new(@log_path, 'weekly')
+ @log.info 'Starting filter_health_model_builder plugin'
+ end
+ rescue => e
+ ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"})
+ end
+ end
+
+ def start
+ super
+ end
+
+ def shutdown
+ super
+ end
+
+ def filter_stream(tag, es)
+ begin
+ if !@@cluster_health_model_enabled
+ @log.info "Cluster Health Model disabled in filter_health_model_builder"
+ return []
+ end
+ new_es = MultiEventStream.new
+ time = Time.now
+
+ if tag.start_with?("oms.api.KubeHealth.DaemonSet")
+ records = []
+ if !es.nil?
+ es.each{|time, record|
+ records.push(record)
+ }
+ @buffer.add_to_buffer(records)
+ end
+ return []
+ elsif tag.start_with?("oms.api.KubeHealth.ReplicaSet")
+ @log.info "TAG #{tag}"
+ records = []
+ es.each{|time, record|
+ records.push(record)
+ }
+ @buffer.add_to_buffer(records)
+ records_to_process = @buffer.get_buffer
+ @buffer.reset_buffer
+
+ health_monitor_records = []
+ records_to_process.each do |record|
+ monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID]
+ monitor_id = record[HealthMonitorRecordFields::MONITOR_ID]
+ #HealthMonitorRecord
+ health_monitor_record = HealthMonitorRecord.new(
+ record[HealthMonitorRecordFields::MONITOR_ID],
+ record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID],
+ record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED],
+ record[HealthMonitorRecordFields::DETAILS]["state"],
+ @provider.get_labels(record),
+ @provider.get_config(monitor_id),
+ record[HealthMonitorRecordFields::DETAILS]
+ )
+
+ health_monitor_records.push(health_monitor_record)
+ #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}"
+ end
+
+ @log.info "health_monitor_records.size #{health_monitor_records.size}"
+ # Dedupe daemonset signals
+ # Remove unit monitor signals for “gone” objects
+ # update state for the reduced set of signals
+ reduced_records = @reducer.reduce_signals(health_monitor_records, @resources)
+ reduced_records.each{|record|
+ @state.update_state(record,
+ @provider.get_config(record.monitor_id)
+ )
+ # get the health state based on the monitor's operational state
+ # update state calls updates the state of the monitor based on configuration and history of the the monitor records
+ record.state = @state.get_state(record.monitor_instance_id).new_state
+ }
+ @log.info "after deduping and removing gone objects reduced_records.size #{reduced_records.size}"
+
+ reduced_records = @kube_api_down_handler.handle_kube_api_down(reduced_records)
+ @log.info "after kube api down handler health_monitor_records.size #{health_monitor_records.size}"
+
+ #get the list of 'none' and 'unknown' signals
+ missing_signals = @generator.get_missing_signals(@@cluster_id, reduced_records, @resources, @provider)
+
+ @log.info "after getting missing signals missing_signals.size #{missing_signals.size}"
+ #update state for missing signals
+ missing_signals.each{|signal|
+
+ @state.update_state(signal, @provider.get_config(signal.monitor_id))
+ @log.info "After Updating #{@state.get_state(signal.monitor_instance_id)} #{@state.get_state(signal.monitor_instance_id).new_state}"
+ # for unknown/none records, update the "monitor state" to be the latest state (new_state) of the monitor instance from the state
+ signal.state = @state.get_state(signal.monitor_instance_id).new_state
+ }
+
+ @generator.update_last_received_records(reduced_records)
+ all_records = reduced_records.clone
+ all_records.push(*missing_signals)
+
+ @log.info "after Adding missing signals all_records.size #{all_records.size}"
+
+ # build the health model
+ @model_builder.process_records(all_records)
+ all_monitors = @model_builder.finalize_model
+
+ @log.info "after building health_model #{all_monitors.size}"
+
+ # update the state for aggregate monitors (unit monitors are updated above)
+ all_monitors.each{|monitor_instance_id, monitor|
+ if monitor.is_aggregate_monitor
+ @state.update_state(monitor,
+ @provider.get_config(monitor.monitor_id)
+ )
+ end
+
+ instance_state = @state.get_state(monitor_instance_id)
+ #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}"
+ should_send = instance_state.should_send
+
+ # always send cluster monitor as a heartbeat
+ if !should_send && monitor_instance_id != MonitorId::CLUSTER
+ all_monitors.delete(monitor_instance_id)
+ end
+ }
+
+ @log.info "after optimizing health signals all_monitors.size #{all_monitors.size}"
+
+ # for each key in monitor.keys,
+ # get the state from health_monitor_state
+ # generate the record to send
+ all_monitors.keys.each{|key|
+ record = @provider.get_record(all_monitors[key], state)
+ if record[HealthMonitorRecordFields::MONITOR_ID] == MonitorId::CLUSTER && all_monitors.size > 1
+ old_state = record[HealthMonitorRecordFields::OLD_STATE]
+ new_state = record[HealthMonitorRecordFields::NEW_STATE]
+ if old_state != new_state && @cluster_old_state != old_state && @cluster_new_state != new_state
+ ApplicationInsightsUtility.sendCustomEvent("HealthModel_ClusterStateChanged",{"old_state" => old_state , "new_state" => new_state, "monitor_count" => all_monitors.size})
+ @log.info "sent telemetry for cluster state change from #{record['OldState']} to #{record['NewState']}"
+ @cluster_old_state = old_state
+ @cluster_new_state = new_state
+ end
+ end
+ #@log.info "#{record["Details"]} #{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}"
+ new_es.add(time, record)
+ }
+
+ #emit the stream
+ router.emit_stream(@@rewrite_tag, new_es)
+
+ #initialize monitor_set and model_builder
+ @monitor_set = HealthModel::MonitorSet.new
+ @model_builder = HealthModel::HealthModelBuilder.new(@hierarchy_builder, @state_finalizers, @monitor_set)
+
+ #update cluster state custom resource
+ @cluster_health_state.update_state(@state.to_h)
+
+ # return an empty event stream, else the match will throw a NoMethodError
+ return []
+ elsif tag.start_with?("oms.api.KubeHealth.AgentCollectionTime")
+ # this filter also acts as a pass through as we are rewriting the tag and emitting to the fluent stream
+ es
+ else
+ raise 'Invalid tag #{tag} received'
+ end
+
+ rescue => e
+ ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"})
+ @log.warn "Message: #{e.message} Backtrace: #{e.backtrace}"
+ return nil
+ end
+ end
+ end
+end
diff --git a/source/code/plugin/health/agg_monitor_id_labels.rb b/source/code/plugin/health/agg_monitor_id_labels.rb
new file mode 100644
index 000000000..48ca46184
--- /dev/null
+++ b/source/code/plugin/health/agg_monitor_id_labels.rb
@@ -0,0 +1,26 @@
+module HealthModel
+ class AggregateMonitorInstanceIdLabels # maps aggregate monitor ids to the label keys used when constructing their monitor instance ids
+ @@id_labels_mapping = {
+ MonitorId::SYSTEM_WORKLOAD => [HealthMonitorLabels::NAMESPACE, HealthMonitorLabels::WORKLOAD_NAME],
+ MonitorId::USER_WORKLOAD => [HealthMonitorLabels::NAMESPACE, HealthMonitorLabels::WORKLOAD_NAME],
+ MonitorId::NODE => [HealthMonitorLabels::AGENTPOOL, HealthMonitorLabels::ROLE, HealthMonitorLabels::HOSTNAME],
+ MonitorId::NAMESPACE => [HealthMonitorLabels::NAMESPACE],
+ MonitorId::AGENT_NODE_POOL => [HealthMonitorLabels::AGENTPOOL],
+ # MonitorId::ALL_AGENT_NODE_POOLS => [],
+ # MonitorId::ALL_NODE_POOLS => [],
+ # MonitorId::ALL_NODES => [],
+ # MonitorId::K8S_INFRASTRUCTURE => [],
+ # MonitorId::CLUSTER => [],
+ # MonitorId::WORKLOAD => []
+ }
+
+ def self.get_labels_for(monitor_id) # returns the label-key array for monitor_id, or [] when no mapping exists
+ if @@id_labels_mapping.key?(monitor_id)
+ return @@id_labels_mapping[monitor_id]
+ else
+ return []
+ end
+
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/aggregate_monitor.rb b/source/code/plugin/health/aggregate_monitor.rb
new file mode 100644
index 000000000..794f716ce
--- /dev/null
+++ b/source/code/plugin/health/aggregate_monitor.rb
@@ -0,0 +1,193 @@
+# frozen_string_literal: true
+
+require_relative 'health_model_constants'
+require 'json'
+
+module HealthModel
+ class AggregateMonitor
+ attr_accessor :monitor_id, :monitor_instance_id, :state, :transition_date_time, :aggregation_algorithm, :aggregation_algorithm_params, :labels, :is_aggregate_monitor, :details
+ attr_reader :member_monitors, :member_state_counts
+
+ @@sort_key_order = {
+ MonitorState::UNKNOWN => 1,
+ MonitorState::CRITICAL => 2,
+ MonitorState::WARNING => 3,
+ MonitorState::HEALTHY => 4,
+ MonitorState::NONE => 5
+ }
+
+ # constructor
+ def initialize(
+ monitor_id,
+ monitor_instance_id,
+ state,
+ transition_date_time,
+ aggregation_algorithm,
+ aggregation_algorithm_params,
+ labels
+ )
+ @monitor_id = monitor_id
+ @monitor_instance_id = monitor_instance_id
+ @state = state
+ @transition_date_time = transition_date_time
+ @aggregation_algorithm = aggregation_algorithm || AggregationAlgorithm::WORSTOF
+ @aggregation_algorithm_params = aggregation_algorithm_params
+ @labels = labels
+ @member_monitors = {}
+ @member_state_counts = {}
+ @is_aggregate_monitor = true
+ end
+
+ # adds a member monitor as a child
+ def add_member_monitor(member_monitor_instance_id)
+ unless @member_monitors.key?(member_monitor_instance_id)
+ @member_monitors[member_monitor_instance_id] = true
+ end
+ end
+
+ #removes a member monitor
+ def remove_member_monitor(member_monitor_instance_id)
+ if @member_monitors.key?(member_monitor_instance_id)
+ @member_monitors.delete(member_monitor_instance_id)
+ end
+ end
+
+ # return the member monitors as an array
+ def get_member_monitors
+ @member_monitors.map(&:first) # @member_monitors is a Hash keyed by instance id, so this yields its keys (equivalent to .keys)
+ end
+
+ # calculates the state of the aggregate monitor based on aggregation algorithm and child monitor states
+ def calculate_state(monitor_set)
+ case @aggregation_algorithm
+ when AggregationAlgorithm::WORSTOF
+ @state = calculate_worst_of_state(monitor_set)
+ when AggregationAlgorithm::PERCENTAGE
+ @state = calculate_percentage_state(monitor_set)
+ else
+ raise 'No aggregation algorithm specified'
+ end
+ end
+
+ def calculate_details(monitor_set)
+ @details = {}
+ @details['details'] = {}
+ @details['state'] = state
+ @details['timestamp'] = transition_date_time
+ ids = []
+ member_monitor_instance_ids = get_member_monitors
+ member_monitor_instance_ids.each{|member_monitor_id|
+ member_monitor = monitor_set.get_monitor(member_monitor_id)
+ member_state = member_monitor.state
+ if @details['details'].key?(member_state)
+ ids = @details['details'][member_state]
+ if !ids.include?(member_monitor.monitor_instance_id)
+ ids.push(member_monitor.monitor_instance_id)
+ end
+ @details['details'][member_state] = ids
+ else
+ @details['details'][member_state] = [member_monitor.monitor_instance_id]
+ end
+ }
+ end
+
+ # calculates the worst of state, given the member monitors
+ def calculate_worst_of_state(monitor_set) # worst-of aggregation: severity order critical > error > warning > unknown > healthy > none
+
+ @member_state_counts = map_member_monitor_states(monitor_set)
+
+ if member_state_counts.empty? # FIX: idiomatic empty? instead of the JS-style `length === 0`
+ return MonitorState::NONE
+ end
+
+ if member_state_counts.key?(MonitorState::CRITICAL) && member_state_counts[MonitorState::CRITICAL] > 0
+ return MonitorState::CRITICAL
+ end
+ if member_state_counts.key?(MonitorState::ERROR) && member_state_counts[MonitorState::ERROR] > 0
+ return MonitorState::ERROR
+ end
+ if member_state_counts.key?(MonitorState::WARNING) && member_state_counts[MonitorState::WARNING] > 0
+ return MonitorState::WARNING
+ end
+
+ if member_state_counts.key?(MonitorState::UNKNOWN) && member_state_counts[MonitorState::UNKNOWN] > 0
+ return MonitorState::UNKNOWN
+ end
+
+ if member_state_counts.key?(MonitorState::HEALTHY) && member_state_counts[MonitorState::HEALTHY] > 0
+ return MonitorState::HEALTHY #healthy should win over none in aggregation
+ end
+
+ return MonitorState::NONE
+
+ end
+
+ # calculates a percentage state, given the aggregation algorithm parameters
+ def calculate_percentage_state(monitor_set)
+
+ #sort
+ #TODO: What if sorted_filtered is empty? is that even possible?
+ sorted_filtered = sort_filter_member_monitors(monitor_set)
+
+ state_threshold = @aggregation_algorithm_params['state_threshold'].to_f
+
+ size = sorted_filtered.size
+ if size == 1
+ @state = sorted_filtered[0].state
+ else
+ count = ((state_threshold*size)/100).ceil
+ index = size - count
+ @state = sorted_filtered[index].state
+ end
+ end
+
+ # maps states of member monitors to counts
+ def map_member_monitor_states(monitor_set)
+ member_monitor_instance_ids = get_member_monitors
+ if member_monitor_instance_ids.nil? || member_monitor_instance_ids.size == 0
+ return {}
+ end
+
+ state_counts = {}
+
+ member_monitor_instance_ids.each {|monitor_instance_id|
+
+ member_monitor = monitor_set.get_monitor(monitor_instance_id)
+ monitor_state = member_monitor.state
+
+ if !state_counts.key?(monitor_state)
+ state_counts[monitor_state] = 1
+ else
+ count = state_counts[monitor_state]
+ state_counts[monitor_state] = count+1
+ end
+ }
+
+ return state_counts;
+ end
+
+ # Sort the member monitors in the following order
+=begin
+ 1. Error
+ 2. Unknown
+ 3. Critical
+ 4. Warning
+ 5. Healthy
+ Remove 'none' state monitors
+=end
+ def sort_filter_member_monitors(monitor_set)
+ member_monitor_instance_ids = get_member_monitors
+ member_monitors = []
+
+ member_monitor_instance_ids.each {|monitor_instance_id|
+ member_monitor = monitor_set.get_monitor(monitor_instance_id)
+ member_monitors.push(member_monitor)
+ }
+
+ filtered = member_monitors.select{|monitor| monitor.state != MonitorState::NONE}
+ sorted = filtered.sort_by{ |monitor| [@@sort_key_order[monitor.state]] }
+
+ return sorted
+ end
+ end
+end
diff --git a/source/code/plugin/health/aggregate_monitor_state_finalizer.rb b/source/code/plugin/health/aggregate_monitor_state_finalizer.rb
new file mode 100644
index 000000000..74e780924
--- /dev/null
+++ b/source/code/plugin/health/aggregate_monitor_state_finalizer.rb
@@ -0,0 +1,33 @@
+module HealthModel
+ # Finalizes a fully-built monitor hierarchy: rolls up the state of every
+ # aggregate monitor bottom-up starting from the 'cluster' root, then
+ # computes the details payload for each aggregate monitor in the set.
+ class AggregateMonitorStateFinalizer
+
+ # Entry point. Rolls up states from the cluster root (if present), then
+ # calls calculate_details on every aggregate monitor in monitor_set.
+ def finalize(monitor_set)
+ top_level_monitor = monitor_set.get_monitor(MonitorId::CLUSTER)
+ if !top_level_monitor.nil?
+ calculate_subtree_state(top_level_monitor, monitor_set)
+ end
+ monitor_set.get_map.each{|k,v|
+ if v.is_aggregate_monitor
+ v.calculate_details(monitor_set)
+ end
+ }
+ end
+
+ private
+ # Depth-first post-order recursion: roll up all aggregate descendants
+ # first, then compute this monitor's own state from its members.
+ # Raises if monitor is nil or not an aggregate monitor.
+ def calculate_subtree_state(monitor, monitor_set)
+ if monitor.nil? || !monitor.is_aggregate_monitor
+ raise 'AggregateMonitorStateFinalizer:calculateSubtreeState Parameter monitor must be non-null AggregateMonitor'
+ end
+
+ member_monitor_instance_ids = monitor.get_member_monitors # monitor_instance_ids
+ member_monitor_instance_ids.each{|member_monitor_instance_id|
+ member_monitor = monitor_set.get_monitor(member_monitor_instance_id)
+
+ # unit monitors need no recursion; their state is already final
+ if !member_monitor.nil? && member_monitor.is_aggregate_monitor
+ calculate_subtree_state(member_monitor, monitor_set)
+ end
+ }
+ monitor.calculate_state(monitor_set)
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/cluster_health_state.rb b/source/code/plugin/health/cluster_health_state.rb
new file mode 100644
index 000000000..ac7e05675
--- /dev/null
+++ b/source/code/plugin/health/cluster_health_state.rb
@@ -0,0 +1,115 @@
+require "net/http"
+require "net/https"
+require "uri"
+
+module HealthModel
+ # Persists and retrieves the cluster's rolled-up health state as a custom
+ # resource (azmon.container.insights/v1 HealthState, named
+ # 'cluster-health-state' in the kube-system namespace) via the Kubernetes
+ # API server, authenticating with a service-account bearer token.
+ class ClusterHealthState
+
+ attr_reader :token_file_path, :cert_file_path, :log, :http_client, :uri, :token
+ @@resource_uri_template = "%{kube_api_server_url}/apis/azmon.container.insights/v1/namespaces/kube-system/healthstates/cluster-health-state"
+
+ # token_file_path: file containing the service-account bearer token
+ # cert_file_path: CA certificate used to verify the API server (VERIFY_PEER)
+ def initialize(token_file_path, cert_file_path)
+ @token_file_path = token_file_path
+ @cert_file_path = cert_file_path
+ @log = HealthMonitorHelpers.get_log_handle
+ # get_http_client also resolves and memoizes @uri as a side effect
+ @http_client = get_http_client
+ @token = get_token
+ end
+
+ # Creates (POST) or updates (merge-PATCH) the HealthState resource with the
+ # supplied state. A GET decides between create (404) and update (200).
+ def update_state(state)
+ get_request = Net::HTTP::Get.new(@uri.request_uri)
+
+ get_request["Authorization"] = "Bearer #{@token}"
+ @log.info "Making GET request to #{@uri.request_uri} @ #{Time.now.utc.iso8601}"
+ get_response = @http_client.request(get_request)
+ @log.info "Got response of #{get_response.code} for #{@uri.request_uri} @ #{Time.now.utc.iso8601}"
+
+ if get_response.code.to_i == 404 # NOT found
+ #POST
+ update_request = Net::HTTP::Post.new(@uri.request_uri)
+ update_request["Content-Type"] = "application/json"
+
+ elsif get_response.code.to_i == 200 # Update == Patch
+ #PATCH
+ update_request = Net::HTTP::Patch.new(@uri.request_uri)
+ update_request["Content-Type"] = "application/merge-patch+json"
+ end
+ # NOTE(review): any GET response other than 404/200 (401/403/5xx...) leaves
+ # update_request nil, so the next line raises NoMethodError -- confirm
+ # whether other status codes need explicit handling.
+ update_request["Authorization"] = "Bearer #{@token}"
+
+ # the state is double-encoded: stored as a JSON string inside the
+ # resource body (get_state symmetrically parses it twice)
+ update_request_body = get_update_request_body
+ update_request_body["state"] = state.to_json
+ update_request.body = update_request_body.to_json
+
+ update_response = @http_client.request(update_request)
+ @log.info "Got a response of #{update_response.code} for #{update_request.method}"
+ end
+
+ # Fetches the stored state hash from the HealthState resource.
+ # Returns {} on any non-200 response.
+ def get_state
+ get_request = Net::HTTP::Get.new(@uri.request_uri)
+ get_request["Authorization"] = "Bearer #{@token}"
+ @log.info "Making GET request to #{@uri.request_uri} @ #{Time.now.utc.iso8601}"
+ get_response = @http_client.request(get_request)
+ @log.info "Got response of #{get_response.code} for #{@uri.request_uri} @ #{Time.now.utc.iso8601}"
+
+ if get_response.code.to_i == 200
+ # double parse: the resource body is JSON and its "state" field is a
+ # JSON-encoded string (see update_state)
+ return JSON.parse(JSON.parse(get_response.body)["state"])
+ else
+ return {}
+ end
+ end
+
+ private
+ # Reads the bearer token from @token_file_path; returns nil (after logging)
+ # when the file is absent or unreadable.
+ # NOTE(review): the begin block has no rescue/ensure, so File errors still
+ # propagate -- presumably a rescue was intended; confirm.
+ def get_token()
+ begin
+ if File.exist?(@token_file_path) && File.readable?(@token_file_path)
+ token_str = File.read(@token_file_path).strip
+ return token_str
+ else
+ @log.info ("Unable to read token string from #{@token_file_path}")
+ return nil
+ end
+ end
+ end
+
+ # Builds the Net::HTTP client for the HealthState resource URI (also sets
+ # @uri). Verifies the server against @cert_file_path; raises if the cert
+ # file does not exist.
+ def get_http_client()
+ kube_api_server_url = get_kube_api_server_url
+ resource_uri = @@resource_uri_template % {
+ kube_api_server_url: kube_api_server_url
+ }
+ @uri = URI.parse(resource_uri)
+ http = Net::HTTP.new(@uri.host, @uri.port)
+ http.use_ssl = true
+ if !File.exist?(@cert_file_path)
+ raise "#{@cert_file_path} doesnt exist"
+ else
+ http.ca_file = @cert_file_path
+ end
+ http.verify_mode = OpenSSL::SSL::VERIFY_PEER
+ return http
+ end
+
+ # Derives the API server base URL from the in-cluster environment
+ # variables; returns nil (after logging) when they are not set.
+ def get_kube_api_server_url
+ if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"]
+ return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}"
+ else
+ @log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri")
+ if Gem.win_platform? #unit testing on windows dev machine
+ # NOTE(review): 'url' computed from kubectl is never used; a fixed
+ # localhost URL is returned instead (acknowledged by the comment below)
+ value = %x( kubectl -n default get endpoints kubernetes --no-headers)
+ url = "https://#{value.split(' ')[1]}"
+ return "https://localhost:8080" # This is NEVER used. this is just to return SOME value
+ end
+ return nil
+ end
+ end
+
+ # Skeleton resource body (apiVersion/kind/metadata) shared by the POST and
+ # PATCH requests in update_state.
+ def get_update_request_body
+ body = {}
+ body["apiVersion"] = "azmon.container.insights/v1"
+ body["kind"] = "HealthState"
+ body["metadata"] = {}
+ body["metadata"]["name"] = "cluster-health-state"
+ body["metadata"]["namespace"] = "kube-system"
+ return body
+ end
+ end
+end
diff --git a/source/code/plugin/health/health_hierarchy_builder.rb b/source/code/plugin/health/health_hierarchy_builder.rb
new file mode 100644
index 000000000..2da0050db
--- /dev/null
+++ b/source/code/plugin/health/health_hierarchy_builder.rb
@@ -0,0 +1,76 @@
+require 'json'
+module HealthModel
+ class HealthHierarchyBuilder
+
+ attr_accessor :health_model_definition, :monitor_factory
+
+ def initialize(health_model_definition, monitor_factory)
+
+ if !health_model_definition.is_a?(ParentMonitorProvider)
+ raise "Invalid Type Expected: ParentMonitorProvider Actual: #{@health_model_definition.class.name}"
+ end
+ @health_model_definition = health_model_definition
+
+ if !monitor_factory.is_a?(MonitorFactory)
+ raise "Invalid Type Expected: MonitorFactory Actual: #{@monitor_factory.class.name}"
+ end
+ @monitor_factory = monitor_factory
+ end
+
+ def process_record(health_monitor_record, monitor_set)
+ if !health_monitor_record.is_a?(HealthMonitorRecord)
+ raise "Unexpected Type #{health_monitor_record.class}"
+ end
+
+ # monitor state transition will always be on a unit monitor
+ child_monitor = @monitor_factory.create_unit_monitor(health_monitor_record)
+ monitor_set.add_or_update(child_monitor)
+ parent_monitor_id = @health_model_definition.get_parent_monitor_id(child_monitor)
+ monitor_labels = child_monitor.labels
+ monitor_id = child_monitor.monitor_id
+
+ # to construct the parent monitor,
+ # 1. Child's labels
+ # 2. Parent monitor's config to determine what labels to copy
+ # 3. Parent Monitor Id
+ # 4. Monitor Id --> Labels to hash Mapping to generate the monitor instance id for aggregate monitors
+
+ while !parent_monitor_id.nil?
+ #puts "Parent Monitor Id #{parent_monitor_id}"
+ # get the set of labels to copy to parent monitor
+ parent_monitor_labels = @health_model_definition.get_parent_monitor_labels(monitor_id, monitor_labels, parent_monitor_id)
+ # get the parent monitor configuration
+ parent_monitor_configuration = @health_model_definition.get_parent_monitor_config(parent_monitor_id)
+ #get monitor instance id for parent monitor. Does this belong in ParentMonitorProvider?
+ parent_monitor_instance_id = @health_model_definition.get_parent_monitor_instance_id(child_monitor.monitor_instance_id, parent_monitor_id, parent_monitor_labels)
+ # check if monitor set has the parent monitor id
+ # if not present, add
+ # if present, update the state based on the aggregation algorithm
+ parent_monitor = nil
+ if !monitor_set.contains?(parent_monitor_instance_id)
+ parent_monitor = @monitor_factory.create_aggregate_monitor(parent_monitor_id, parent_monitor_instance_id, parent_monitor_labels, parent_monitor_configuration['aggregation_algorithm'], parent_monitor_configuration['aggregation_algorithm_params'], child_monitor)
+ parent_monitor.add_member_monitor(child_monitor.monitor_instance_id)
+ else
+ parent_monitor = monitor_set.get_monitor(parent_monitor_instance_id)
+ # required to calculate the rollup state
+ parent_monitor.add_member_monitor(child_monitor.monitor_instance_id)
+ # update to the earliest of the transition times of child monitors
+ if child_monitor.transition_date_time < parent_monitor.transition_date_time
+ parent_monitor.transition_date_time = child_monitor.transition_date_time
+ end
+ end
+
+ if parent_monitor.nil?
+ raise 'Parent_monitor should not be nil for #{monitor_id}'
+ end
+
+ monitor_set.add_or_update(parent_monitor)
+
+ child_monitor = parent_monitor
+ parent_monitor_id = @health_model_definition.get_parent_monitor_id(child_monitor)
+ monitor_labels = child_monitor.labels
+ monitor_id = child_monitor.monitor_id
+ end
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_kube_api_down_handler.rb b/source/code/plugin/health/health_kube_api_down_handler.rb
new file mode 100644
index 000000000..7f7ba1bd3
--- /dev/null
+++ b/source/code/plugin/health/health_kube_api_down_handler.rb
@@ -0,0 +1,27 @@
+module HealthModel
+ class HealthKubeApiDownHandler
+ def initialize
+ @@monitors_to_change = [HealthMonitorConstants::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID,
+ HealthMonitorConstants::WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID,
+ HealthMonitorConstants::NODE_CONDITION_MONITOR_ID,
+ HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID,
+ HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID]
+ end
+
+ # update kube-api dependent monitors to be 'unknown' if kube-api is down or monitor is unavailable
+ def handle_kube_api_down(health_monitor_records)
+ health_monitor_records_map = {}
+
+ health_monitor_records.map{|record| health_monitor_records_map[record.monitor_instance_id] = record}
+ if !health_monitor_records_map.key?(HealthMonitorConstants::KUBE_API_STATUS) || (health_monitor_records_map.key?(HealthMonitorConstants::KUBE_API_STATUS) && health_monitor_records_map[HealthMonitorConstants::KUBE_API_STATUS].state != 'pass')
+ #iterate over the map and set the state to unknown for related monitors
+ health_monitor_records.each{|health_monitor_record|
+ if @@monitors_to_change.include?(health_monitor_record.monitor_id)
+ health_monitor_record.state = HealthMonitorStates::UNKNOWN
+ end
+ }
+ end
+ return health_monitor_records
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_kubernetes_resources.rb b/source/code/plugin/health/health_kubernetes_resources.rb
new file mode 100644
index 000000000..53f879bf5
--- /dev/null
+++ b/source/code/plugin/health/health_kubernetes_resources.rb
@@ -0,0 +1,102 @@
+require 'singleton'
+
+module HealthModel
+ # Singleton cache of the Kubernetes inventories (nodes, pods, deployments)
+ # used by the health model. Callers are expected to assign the parsed
+ # API responses via the accessors before calling the get_* methods.
+ # NOTE(review): the inventories are initialized as [] but indexed with
+ # ['items'], so they must be replaced with parsed JSON hashes (API list
+ # responses) before use -- confirm callers always do this.
+ class HealthKubernetesResources
+
+ include Singleton
+ attr_accessor :node_inventory, :pod_inventory, :deployment_inventory
+ attr_reader :nodes, :pods, :workloads
+
+ def initialize
+ @node_inventory = []
+ @pod_inventory = []
+ @deployment_inventory = []
+ @nodes = []
+ @pods = []
+ @workloads = []
+ @log = HealthMonitorHelpers.get_log_handle
+ end
+
+ def get_node_inventory
+ return @node_inventory
+ end
+
+ # Returns the deduplicated list of node names from the node inventory
+ # (rebuilds @nodes on every call).
+ def get_nodes
+ @nodes = []
+ @node_inventory['items'].each {|node|
+ if !@nodes.include?(node['metadata']['name'])
+ @nodes.push(node['metadata']['name'])
+ end
+
+ }
+ return @nodes
+ end
+
+ def get_pod_inventory
+ return @pod_inventory
+ end
+
+ def get_pods
+ return @pods
+ end
+
+ # Derives the set of workload names ("namespace~~name") from the pod and
+ # deployment inventories. Pods owned by a ReplicaSet are attributed to the
+ # matching Deployment (via matchLabels) when one exists; Jobs are skipped.
+ def get_workload_names
+ @pods = []
+ workload_names = {}
+ deployment_lookup = {}
+ # index deployments by "namespace-labelKey=labelValue" so a pod's labels
+ # can be matched back to the owning deployment
+ @deployment_inventory['items'].each do |deployment|
+ match_labels = deployment['spec']['selector']['matchLabels'].to_h
+ namespace = deployment['metadata']['namespace']
+ match_labels.each{|k,v|
+ deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}"
+ }
+ end
+ @pod_inventory['items'].each do |pod|
+ begin
+ has_owner = !pod['metadata']['ownerReferences'].nil?
+ owner_kind = ''
+ if has_owner
+ owner_kind = pod['metadata']['ownerReferences'][0]['kind']
+ controller_name = pod['metadata']['ownerReferences'][0]['name']
+ else
+ owner_kind = pod['kind']
+ controller_name = pod['metadata']['name']
+ end
+
+ namespace = pod['metadata']['namespace']
+
+ workload_name = ''
+ if owner_kind.nil?
+ owner_kind = 'Pod'
+ end
+ case owner_kind.downcase
+ when 'job'
+ # we are excluding jobs
+ next
+ when 'replicaset'
+ # get the labels, and see if there is a match. If there is, it is the deployment. If not, use replica set name/controller name
+ labels = pod['metadata']['labels'].to_h
+ labels.each {|k,v|
+ lookup_key = "#{namespace}-#{k}=#{v}"
+ if deployment_lookup.key?(lookup_key)
+ workload_name = deployment_lookup[lookup_key]
+ break
+ end
+ }
+ if workload_name.empty?
+ workload_name = "#{namespace}~~#{controller_name}"
+ end
+ when 'daemonset'
+ workload_name = "#{namespace}~~#{controller_name}"
+ else
+ workload_name = "#{namespace}~~#{pod['metadata']['name']}"
+ end
+ rescue => e
+ @log.info "Error when processing pod #{pod['metadata']['name']} #{e.message}"
+ end
+ # NOTE(review): when the rescue above fires, workload_name may still be
+ # '' and an empty-string workload gets recorded -- confirm intended.
+ workload_names[workload_name] = true
+ end
+ return workload_names.keys
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_missing_signal_generator.rb b/source/code/plugin/health/health_missing_signal_generator.rb
new file mode 100644
index 000000000..ff7f6a390
--- /dev/null
+++ b/source/code/plugin/health/health_missing_signal_generator.rb
@@ -0,0 +1,142 @@
+module HealthModel
+ class HealthMissingSignalGenerator
+ attr_accessor :last_received_records, :current_received_records
+ attr_reader :missing_signals, :unknown_signals_hash
+
+ def initialize()
+ @last_received_records = {}
+ @unknown_signals_hash = {}
+ end
+
+ def get_missing_signals(cluster_id, health_monitor_records, health_k8s_inventory, provider)
+ missing_monitor_ids = []
+ nodes = health_k8s_inventory.get_nodes
+ workload_names = health_k8s_inventory.get_workload_names
+ missing_signals_map = {}
+ missing_signals = []
+ health_monitor_records_map = {}
+ health_monitor_records.map{
+ |monitor| health_monitor_records_map[monitor.monitor_instance_id] = monitor
+ }
+
+ node_signals_hash = {}
+ nodes.each{|node|
+ node_signals_hash[node] = [HealthMonitorConstants::NODE_CPU_MONITOR_ID, HealthMonitorConstants::NODE_MEMORY_MONITOR_ID, HealthMonitorConstants::NODE_CONDITION_MONITOR_ID]
+ }
+ log = HealthMonitorHelpers.get_log_handle
+ log.info "last_received_records #{@last_received_records.size} nodes #{nodes}"
+ @last_received_records.each{|monitor_instance_id, monitor|
+ if !health_monitor_records_map.key?(monitor_instance_id)
+ if HealthMonitorHelpers.is_node_monitor(monitor.monitor_id)
+ node_name = monitor.labels['kubernetes.io/hostname']
+ new_monitor = HealthMonitorRecord.new(
+ monitor.monitor_id,
+ monitor.monitor_instance_id,
+ Time.now.utc.iso8601,
+ monitor.state,
+ monitor.labels,
+ monitor.config,
+ {"timestamp" => Time.now.utc.iso8601, "state" => HealthMonitorStates::UNKNOWN, "details" => ""}
+ )
+ if !node_name.nil? && nodes.include?(node_name)
+ new_monitor.state = HealthMonitorStates::UNKNOWN
+ new_monitor.details["state"] = HealthMonitorStates::UNKNOWN
+ new_monitor.details["details"] = "Node present in inventory but no signal for #{monitor.monitor_id} from node #{node_name}"
+ @unknown_signals_hash[monitor_instance_id] = new_monitor
+ elsif !node_name.nil? && !nodes.include?(node_name)
+ new_monitor.state = HealthMonitorStates::NONE
+ new_monitor.details["state"] = HealthMonitorStates::NONE
+ new_monitor.details["details"] = "Node NOT present in inventory. node: #{node_name}"
+ end
+ missing_signals_map[monitor_instance_id] = new_monitor
+ log.info "Added missing signal #{new_monitor.monitor_instance_id} #{new_monitor.state}"
+ elsif HealthMonitorHelpers.is_pods_ready_monitor(monitor.monitor_id)
+ lookup = "#{monitor.labels[HealthMonitorLabels::NAMESPACE]}~~#{monitor.labels[HealthMonitorLabels::WORKLOAD_NAME]}"
+ new_monitor = HealthMonitorRecord.new(
+ monitor.monitor_id,
+ monitor.monitor_instance_id,
+ Time.now.utc.iso8601,
+ monitor.state,
+ monitor.labels,
+ monitor.config,
+ {"timestamp" => Time.now.utc.iso8601, "state" => HealthMonitorStates::UNKNOWN, "details" => ""}
+ )
+ if !lookup.nil? && workload_names.include?(lookup)
+ new_monitor.state = HealthMonitorStates::UNKNOWN
+ new_monitor.details["state"] = HealthMonitorStates::UNKNOWN
+ new_monitor.details["details"] = "Workload present in inventory. But no signal for #{lookup}"
+ @unknown_signals_hash[monitor_instance_id] = new_monitor
+ elsif !lookup.nil? && !workload_names.include?(lookup)
+ new_monitor.state = HealthMonitorStates::NONE
+ new_monitor.details["state"] = HealthMonitorStates::NONE
+ new_monitor.details["details"] = "Workload #{lookup} NOT present in inventory"
+ end
+ missing_signals_map[monitor_instance_id] = new_monitor
+ end
+ end
+ }
+
+
+ health_monitor_records.each{|health_monitor_record|
+ # remove signals from the list of expected signals if we see them in the list of current signals
+ if HealthMonitorHelpers.is_node_monitor(health_monitor_record.monitor_id)
+ node_name = health_monitor_record.labels['kubernetes.io/hostname']
+ if node_signals_hash.key?(node_name)
+ signals = node_signals_hash[node_name]
+ signals.delete(health_monitor_record.monitor_id)
+ if signals.size == 0
+ node_signals_hash.delete(node_name)
+ end
+ end
+ end
+ }
+
+ # if the hash is not empty, means we have missing signals
+ if node_signals_hash.size > 0
+ # these signals were not sent previously
+ # these signals need to be assigned an unknown state
+ node_signals_hash.each{|node, monitor_ids|
+ monitor_ids.each{|monitor_id|
+ monitor_instance_id = HealthMonitorHelpers.get_monitor_instance_id(monitor_id, [cluster_id, node])
+ new_monitor = HealthMonitorRecord.new(
+ monitor_id,
+ monitor_instance_id,
+ Time.now.utc.iso8601,
+ HealthMonitorStates::UNKNOWN,
+ provider.get_node_labels(node),
+ {},
+ {"timestamp" => Time.now.utc.iso8601, "state" => HealthMonitorStates::UNKNOWN, "details" => "no signal received from node #{node}"}
+ )
+ missing_signals_map[monitor_instance_id] = new_monitor
+ log.info "Added missing signal when node_signals_hash was not empty #{new_monitor.monitor_instance_id} #{new_monitor.state}"
+ }
+ }
+ end
+
+ missing_signals_map.each{|k,v|
+ missing_signals.push(v)
+ }
+
+ # if an unknown signal is present neither in missing signals or the incoming signals, change its state to none, and remove from unknown_signals
+ # in update_state of HealthMonitorState, send if latest_record_state is none
+ @unknown_signals_hash.each{|k,v|
+ if !missing_signals_map.key?(k) && !health_monitor_records_map.key?(k)
+ monitor_record = @unknown_signals_hash[k]
+ monitor_record.details["state"] = HealthMonitorStates::NONE # used for calculating the old and new states in update_state
+ monitor_record.state = HealthMonitorStates::NONE #used for calculating the aggregate monitor state
+ missing_signals.push(monitor_record)
+ @unknown_signals_hash.delete(k)
+ log.info "Updating state from unknown to none for #{k}"
+ end
+ }
+ return missing_signals
+ end
+
+ def update_last_received_records(last_received_records)
+ last_received_records_map = {}
+ last_received_records.map {|record| last_received_records_map[record.monitor_instance_id] = record }
+ @last_received_records = last_received_records_map
+ end
+ end
+
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_model_buffer.rb b/source/code/plugin/health/health_model_buffer.rb
new file mode 100644
index 000000000..1ccfe7349
--- /dev/null
+++ b/source/code/plugin/health/health_model_buffer.rb
@@ -0,0 +1,29 @@
+module HealthModel
+
+=begin
+ Class that is used to create a buffer for collecting the health records
+=end
+ class HealthModelBuffer
+
+ attr_reader :records_buffer, :log
+
+ def initialize
+ @records_buffer = []
+ end
+
+ # Returns the current buffer
+ def get_buffer
+ return @records_buffer
+ end
+
+ # adds records to the buffer
+ def add_to_buffer(records)
+ @records_buffer.push(*records)
+ end
+
+ # clears/resets the buffer
+ def reset_buffer
+ @records_buffer = []
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_model_builder.rb b/source/code/plugin/health/health_model_builder.rb
new file mode 100644
index 000000000..4cf802798
--- /dev/null
+++ b/source/code/plugin/health/health_model_builder.rb
@@ -0,0 +1,37 @@
+require_relative 'health_model_constants'
+require 'time'
+
+module HealthModel
+ class HealthModelBuilder
+ attr_accessor :hierarchy_builder, :state_finalizers, :monitor_set
+
+ def initialize(hierarchy_builder, state_finalizers, monitor_set)
+ @hierarchy_builder = hierarchy_builder
+ @state_finalizers = state_finalizers
+ @monitor_set = monitor_set
+ end
+
+ def process_records(health_records)
+ health_records.each{|health_record|
+ @hierarchy_builder.process_record(health_record, @monitor_set)
+ }
+ end
+
+ def finalize_model
+ if !@state_finalizers.is_a?(Array)
+ raise 'state finalizers should be an array'
+ end
+
+ if @state_finalizers.length == 0
+ raise '@state_finalizers length should not be zero or empty'
+ end
+
+ @state_finalizers.each{|finalizer|
+ finalizer.finalize(@monitor_set)
+ }
+
+ return @monitor_set.get_map
+ end
+
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_model_constants.rb b/source/code/plugin/health/health_model_constants.rb
new file mode 100644
index 000000000..82ae569f3
--- /dev/null
+++ b/source/code/plugin/health/health_model_constants.rb
@@ -0,0 +1,81 @@
+module HealthModel
+ class MonitorState
+ CRITICAL = "fail"
+ ERROR = "err"
+ WARNING = "warn"
+ NONE = "none"
+ HEALTHY = "pass"
+ UNKNOWN = "unknown"
+ end
+
+ class AggregationAlgorithm
+ WORSTOF = "worstOf"
+ PERCENTAGE = "percentage"
+ end
+
+ class MonitorId
+ CLUSTER = 'cluster';
+ ALL_NODES = 'all_nodes';
+ K8S_INFRASTRUCTURE = 'k8s_infrastructure'
+
+ NODE = 'node';
+ AGENT_NODE_POOL = 'agent_node_pool'
+ MASTER_NODE_POOL = 'master_node_pool'
+ ALL_AGENT_NODE_POOLS = 'all_agent_node_pools'
+ ALL_NODE_POOLS = 'all_node_pools';
+
+ WORKLOAD = 'all_workloads';
+ CAPACITY = 'capacity';
+
+ USER_WORKLOAD = 'user_workload';
+ SYSTEM_WORKLOAD = 'system_workload'
+ NAMESPACE = 'namespace';
+ end
+
+ class HealthMonitorRecordFields
+ CLUSTER_ID = "ClusterId"
+ MONITOR_ID = "MonitorId"
+ MONITOR_INSTANCE_ID = "MonitorInstanceId"
+ MONITOR_LABELS = "MonitorLabels"
+ DETAILS = "Details"
+ MONITOR_CONFIG = "MonitorConfig"
+ OLD_STATE = "OldState"
+ NEW_STATE = "NewState"
+ AGENT_COLLECTION_TIME = "AgentCollectionTime"
+ TIME_FIRST_OBSERVED = "TimeFirstObserved"
+ NODE_NAME = "NodeName"
+ NAMESPACE = "Namespace"
+ end
+
+ class HealthMonitorConstants
+ NODE_CPU_MONITOR_ID = "node_cpu_utilization"
+ NODE_MEMORY_MONITOR_ID = "node_memory_utilization"
+ CONTAINER_CPU_MONITOR_ID = "container_cpu_utilization"
+ CONTAINER_MEMORY_MONITOR_ID = "container_memory_utilization"
+ NODE_CONDITION_MONITOR_ID = "node_condition"
+ WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID = "subscribed_capacity_cpu"
+ WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID = "subscribed_capacity_memory"
+ WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID = "container_cpu_utilization"
+ WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID = "container_memory_utilization"
+ KUBE_API_STATUS = "kube_api_status"
+ USER_WORKLOAD_PODS_READY_MONITOR_ID = "user_workload_pods_ready"
+ SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID = "system_workload_pods_ready"
+ end
+
+ class HealthMonitorStates
+ PASS = "pass"
+ FAIL = "fail"
+ WARNING = "warn"
+ NONE = "none"
+ UNKNOWN = "unknown"
+ end
+
+ class HealthMonitorLabels
+ WORKLOAD_NAME = "container.azm.ms/workload-name"
+ WORKLOAD_KIND = "container.azm.ms/workload-kind"
+ NAMESPACE = "container.azm.ms/namespace"
+ AGENTPOOL = "agentpool"
+ ROLE = "kubernetes.io/role"
+ HOSTNAME = "kubernetes.io/hostname"
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_model_definition_parser.rb b/source/code/plugin/health/health_model_definition_parser.rb
new file mode 100644
index 000000000..f6c7a781d
--- /dev/null
+++ b/source/code/plugin/health/health_model_definition_parser.rb
@@ -0,0 +1,50 @@
+=begin
+ Class to parse the health model definition. The definition expresses the relationship between monitors, how to roll up to an aggregate monitor,
+ and what labels to "pass on" to the parent monitor
+=end
+require 'json'
+
+module HealthModel
+ # Parses the JSON health model definition into a hash keyed by monitor_id.
+ # Each entry captures the parent monitor (either a fixed id, a list of
+ # label-based conditions, or nil for the root), the labels to propagate to
+ # the parent, and the aggregation algorithm plus its parameters.
+ class HealthModelDefinitionParser
+ attr_accessor :health_model_definition_path, :health_model_definition
+
+ # Constructor
+ def initialize(path)
+ @health_model_definition = {}
+ @health_model_definition_path = path
+ end
+
+ # Parse the health model definition file and build the model roll-up hierarchy
+ # Returns (and memoizes in @health_model_definition) the parsed hash.
+ # Raises when the file does not exist.
+ def parse_file
+ if (!File.exist?(@health_model_definition_path))
+ raise "File does not exist in the specified path"
+ end
+
+ file = File.read(@health_model_definition_path)
+ temp_model = JSON.parse(file)
+ temp_model.each { |entry|
+ monitor_id = entry['monitor_id']
+ parent_monitor_id = entry['parent_monitor_id']
+ # the trailing 'if' modifiers leave these nil when the key is absent
+ labels = entry['labels'] if entry['labels']
+ aggregation_algorithm = entry['aggregation_algorithm'] if entry['aggregation_algorithm']
+ aggregation_algorithm_params = entry['aggregation_algorithm_params'] if entry['aggregation_algorithm_params']
+ if parent_monitor_id.is_a?(Array)
+ # conditional parent: a list of {label, operator, value, id} rules
+ conditions = []
+ parent_monitor_id.each{|condition|
+ key = condition['label']
+ operator = condition['operator']
+ value = condition['value']
+ parent_id = condition['id']
+ conditions.push({"key" => key, "operator" => operator, "value" => value, "parent_id" => parent_id})
+ }
+ @health_model_definition[monitor_id] = {"conditions" => conditions, "labels" => labels, "aggregation_algorithm" => aggregation_algorithm, "aggregation_algorithm_params" =>aggregation_algorithm_params}
+ elsif parent_monitor_id.is_a?(String)
+ # fixed parent monitor id
+ @health_model_definition[monitor_id] = {"parent_monitor_id" => parent_monitor_id, "labels" => labels, "aggregation_algorithm" => aggregation_algorithm, "aggregation_algorithm_params" =>aggregation_algorithm_params}
+ elsif parent_monitor_id.nil?
+ # root of the hierarchy
+ @health_model_definition[monitor_id] = {"parent_monitor_id" => nil, "labels" => labels, "aggregation_algorithm" => aggregation_algorithm, "aggregation_algorithm_params" =>aggregation_algorithm_params}
+ end
+ }
+ @health_model_definition
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_monitor_helpers.rb b/source/code/plugin/health/health_monitor_helpers.rb
new file mode 100644
index 000000000..9e2977a0e
--- /dev/null
+++ b/source/code/plugin/health/health_monitor_helpers.rb
@@ -0,0 +1,36 @@
+require 'logger'
+require 'digest'
+
+module HealthModel
+ # static class that provides a bunch of utility methods
+ class HealthMonitorHelpers
+
+ @log_path = "/var/opt/microsoft/docker-cimprov/log/health_monitors.log"
+
+ if Gem.win_platform? #unit testing on windows dev machine
+ @log_path = "C:\Temp\health_monitors.log"
+ end
+
+ @log = Logger.new(@log_path, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M
+
+ class << self
+ def is_node_monitor(monitor_id)
+ return (monitor_id == HealthMonitorConstants::NODE_CPU_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_MEMORY_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_CONDITION_MONITOR_ID)
+ end
+
+ def is_pods_ready_monitor(monitor_id)
+ return (monitor_id == HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID || monitor_id == HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID)
+ end
+
+ def get_log_handle
+ return @log
+ end
+
+ def get_monitor_instance_id(monitor_id, args = [])
+ string_to_hash = args.join("/")
+ return "#{monitor_id}-#{Digest::MD5.hexdigest(string_to_hash)}"
+ end
+ end
+
+ end
+end
diff --git a/source/code/plugin/health/health_monitor_optimizer.rb b/source/code/plugin/health/health_monitor_optimizer.rb
new file mode 100644
index 000000000..b33c8a986
--- /dev/null
+++ b/source/code/plugin/health/health_monitor_optimizer.rb
@@ -0,0 +1,52 @@
+module HealthModel
+ # Decides whether a monitor signal should be emitted downstream: sends on
+ # a (sufficiently consistent) state change, on the first record for a
+ # monitor instance, or as a heartbeat after @@health_signal_timeout minutes.
+ class HealthMonitorOptimizer
+ #ctor
+ # NOTE(review): these are class variables assigned in initialize, so they
+ # are shared across ALL instances and reset whenever a new instance is
+ # created -- confirm that only a single instance is ever constructed.
+ def initialize
+ @@health_signal_timeout = 240
+ @@first_record_sent = {}
+ end
+
+ # Returns true when the signal for monitor_instance_id should be sent.
+ # health_monitor_state: supplies per-instance state (prev_records, old/new
+ # state, last-sent time); health_monitor_config may define
+ # 'ConsecutiveSamplesForStateTransition' (defaults to 1).
+ def should_send(monitor_instance_id, health_monitor_state, health_monitor_config)
+
+ health_monitor_instance_state = health_monitor_state.get_state(monitor_instance_id)
+ health_monitor_records = health_monitor_instance_state.prev_records
+ health_monitor_config['ConsecutiveSamplesForStateTransition'].nil? ? samples_to_check = 1 : samples_to_check = health_monitor_config['ConsecutiveSamplesForStateTransition'].to_i
+
+ latest_record = health_monitor_records[health_monitor_records.size-1] #since we push new records to the end, and remove oldest records from the beginning
+ latest_record_state = latest_record["state"]
+ latest_record_time = latest_record["timestamp"] #string representation of time
+
+ new_state = health_monitor_instance_state.new_state
+ prev_sent_time = health_monitor_instance_state.prev_sent_record_time
+ time_first_observed = health_monitor_instance_state.state_change_time
+
+ # no state change: send only as heartbeat or if never sent before
+ if latest_record_state.downcase == new_state.downcase
+ # NOTE(review): Time.parse raises if prev_sent_time is nil (e.g. nothing
+ # sent yet) -- presumably it is initialized upstream; confirm.
+ time_elapsed = (Time.parse(latest_record_time) - Time.parse(prev_sent_time)) / 60
+ if time_elapsed > @@health_signal_timeout # minutes
+ return true
+ elsif !@@first_record_sent.key?(monitor_instance_id)
+ @@first_record_sent[monitor_instance_id] = true
+ return true
+ else
+ return false
+ end
+ else
+ # state changed: require samples_to_check consecutive consistent samples
+ if samples_to_check == 1
+ return true
+ elsif health_monitor_instance_state.prev_records.size == 1 && samples_to_check > 1
+ return true
+ elsif health_monitor_instance_state.prev_records.size < samples_to_check
+ return false
+ else
+ # state change from previous sent state to latest record state
+ #check state of last n records to see if they are all in the same state
+ if (health_monitor_instance_state.is_state_change_consistent)
+ return true
+ else
+ return false
+ end
+ end
+ end
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_monitor_provider.rb b/source/code/plugin/health/health_monitor_provider.rb
new file mode 100644
index 000000000..0c1cbf7f2
--- /dev/null
+++ b/source/code/plugin/health/health_monitor_provider.rb
@@ -0,0 +1,123 @@
+module HealthModel
+ class HealthMonitorProvider
+
+ attr_accessor :cluster_labels, :health_kubernetes_resources, :monitor_configuration_path, :cluster_id
+ attr_reader :monitor_configuration
+
+ def initialize(cluster_id, cluster_labels, health_kubernetes_resources, monitor_configuration_path)
+ @cluster_labels = Hash.new
+ cluster_labels.each{|k,v| @cluster_labels[k] = v}
+ @cluster_id = cluster_id
+ @health_kubernetes_resources = health_kubernetes_resources
+ @monitor_configuration_path = monitor_configuration_path
+ begin
+ @monitor_configuration = {}
+ file = File.open(@monitor_configuration_path, "r")
+ if !file.nil?
+ fileContents = file.read
+ @monitor_configuration = JSON.parse(fileContents)
+ file.close
+ end
+ rescue => e
+ @log.info "Error when opening health config file #{e}"
+ end
+ end
+
+ def get_record(health_monitor_record, health_monitor_state)
+
+ labels = Hash.new
+ @cluster_labels.each{|k,v| labels[k] = v}
+ monitor_id = health_monitor_record.monitor_id
+ monitor_instance_id = health_monitor_record.monitor_instance_id
+ health_monitor_instance_state = health_monitor_state.get_state(monitor_instance_id)
+
+
+ monitor_labels = health_monitor_record.labels
+ if !monitor_labels.empty?
+ monitor_labels.keys.each do |key|
+ labels[key] = monitor_labels[key]
+ end
+ end
+
+ prev_records = health_monitor_instance_state.prev_records
+ time_first_observed = health_monitor_instance_state.state_change_time # the oldest collection time
+ new_state = health_monitor_instance_state.new_state # this is updated before formatRecord is called
+ old_state = health_monitor_instance_state.old_state
+
+ config = get_config(monitor_id)
+
+ if prev_records.size == 1
+ details = prev_records[0]
+ else
+ details = prev_records
+ end
+
+ time_observed = Time.now.utc.iso8601
+
+ monitor_record = {}
+
+ monitor_record[HealthMonitorRecordFields::CLUSTER_ID] = @cluster_id
+ monitor_record[HealthMonitorRecordFields::MONITOR_LABELS] = labels.to_json
+ monitor_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id
+ monitor_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
+ monitor_record[HealthMonitorRecordFields::NEW_STATE] = new_state
+ monitor_record[HealthMonitorRecordFields::OLD_STATE] = old_state
+ monitor_record[HealthMonitorRecordFields::DETAILS] = details.to_json
+ monitor_record[HealthMonitorRecordFields::MONITOR_CONFIG] = config.to_json
+ monitor_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = Time.now.utc.iso8601
+ monitor_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_first_observed
+
+ return monitor_record
+ end
+
+ def get_config(monitor_id)
+ if @monitor_configuration.key?(monitor_id)
+ return @monitor_configuration[monitor_id]
+ else
+ return {}
+ end
+ end
+
+ def get_labels(health_monitor_record)
+ monitor_labels = Hash.new
+ @cluster_labels.keys.each{|key|
+ monitor_labels[key] = @cluster_labels[key]
+ }
+ monitor_id = health_monitor_record[HealthMonitorRecordFields::MONITOR_ID]
+ case monitor_id
+ when HealthMonitorConstants::CONTAINER_CPU_MONITOR_ID, HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID, HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID, HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID
+
+ namespace = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['namespace']
+ workload_name = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['workloadName']
+ workload_kind = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['workloadKind']
+
+ monitor_labels[HealthMonitorLabels::WORKLOAD_NAME] = workload_name.split('~~')[1]
+ monitor_labels[HealthMonitorLabels::WORKLOAD_KIND] = workload_kind
+ monitor_labels[HealthMonitorLabels::NAMESPACE] = namespace
+
+ when HealthMonitorConstants::NODE_CPU_MONITOR_ID, HealthMonitorConstants::NODE_MEMORY_MONITOR_ID, HealthMonitorConstants::NODE_CONDITION_MONITOR_ID
+ node_name = health_monitor_record[HealthMonitorRecordFields::NODE_NAME]
+ @health_kubernetes_resources.get_node_inventory['items'].each do |node|
+ if !node_name.nil? && !node['metadata']['name'].nil? && node_name == node['metadata']['name']
+ if !node["metadata"].nil? && !node["metadata"]["labels"].nil?
+ monitor_labels = monitor_labels.merge(node["metadata"]["labels"])
+ end
+ end
+ end
+ end
+ return monitor_labels
+ end
+
+ def get_node_labels(node_name)
+ monitor_labels = {}
+ @health_kubernetes_resources.get_node_inventory['items'].each do |node|
+ if !node_name.nil? && !node['metadata']['name'].nil? && node_name == node['metadata']['name']
+ if !node["metadata"].nil? && !node["metadata"]["labels"].nil?
+ monitor_labels = node["metadata"]["labels"]
+ end
+ end
+ end
+ return monitor_labels
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_monitor_record.rb b/source/code/plugin/health/health_monitor_record.rb
new file mode 100644
index 000000000..873736c3a
--- /dev/null
+++ b/source/code/plugin/health/health_monitor_record.rb
@@ -0,0 +1,10 @@
+HealthMonitorRecord = Struct.new(
+ :monitor_id,
+ :monitor_instance_id,
+ :transition_date_time,
+ :state,
+ :labels,
+ :config,
+ :details
+ ) do
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_monitor_state.rb b/source/code/plugin/health/health_monitor_state.rb
new file mode 100644
index 000000000..c3df5e3a9
--- /dev/null
+++ b/source/code/plugin/health/health_monitor_state.rb
@@ -0,0 +1,214 @@
+module HealthModel
+
+ HealthMonitorInstanceState = Struct.new(:prev_sent_record_time, :old_state, :new_state, :state_change_time, :prev_records, :is_state_change_consistent, :should_send) do
+ end
+
+ # Class that is used to store the last sent state and the latest monitors.
+ # Provides services such as:
+ # get_state -- returns the current state and details
+ # update_state -- updates the state of the health monitor history records
+ # set_state -- sets the last health monitor state
+ class HealthMonitorState
+
+ def initialize
+ @@monitor_states = {}
+ @@first_record_sent = {}
+ @@health_signal_timeout = 240
+ end
+
+ def get_state(monitor_instance_id)
+ if @@monitor_states.key?(monitor_instance_id)
+ return @@monitor_states[monitor_instance_id]
+ end
+ end
+
+ def set_state(monitor_instance_id, health_monitor_instance_state)
+ @@monitor_states[monitor_instance_id] = health_monitor_instance_state
+ end
+
+ def to_h
+ return @@monitor_states
+ end
+
+ def initialize_state(deserialized_state)
+ @@monitor_states = {}
+ deserialized_state.each{|k,v|
+ health_monitor_instance_state_hash = JSON.parse(v)
+ state = HealthMonitorInstanceState.new(*health_monitor_instance_state_hash.values_at(*HealthMonitorInstanceState.members))
+ state.prev_sent_record_time = health_monitor_instance_state_hash["prev_sent_record_time"]
+ state.old_state = health_monitor_instance_state_hash["old_state"]
+ state.new_state = health_monitor_instance_state_hash["new_state"]
+ state.state_change_time = health_monitor_instance_state_hash["state_change_time"]
+ state.prev_records = health_monitor_instance_state_hash["prev_records"]
+ state.is_state_change_consistent = health_monitor_instance_state_hash["is_state_change_consistent"] || false
+ state.should_send = health_monitor_instance_state_hash["should_send"]
+ @@monitor_states[k] = state
+ @@first_record_sent[k] = true
+
+ }
+ end
+
+=begin
+When do you send?
+-----------------
+1. if the signal hasn't been sent before
+2. if there is a "consistent" state change for monitors
+3. if the signal is stale (> 4hrs)
+4. If the latest state is none
+=end
+ def update_state(monitor, #UnitMonitor/AggregateMonitor
+ monitor_config #Hash
+ )
+ samples_to_keep = 1
+ monitor_instance_id = monitor.monitor_instance_id
+ log = HealthMonitorHelpers.get_log_handle
+ current_time = Time.now.utc.iso8601
+ health_monitor_instance_state = get_state(monitor_instance_id)
+ if !health_monitor_instance_state.nil?
+ health_monitor_instance_state.is_state_change_consistent = false
+ health_monitor_instance_state.should_send = false
+ set_state(monitor_instance_id, health_monitor_instance_state) # reset is_state_change_consistent
+ end
+
+ if !monitor_config.nil? && !monitor_config['ConsecutiveSamplesForStateTransition'].nil?
+ samples_to_keep = monitor_config['ConsecutiveSamplesForStateTransition'].to_i
+ end
+
+ if @@monitor_states.key?(monitor_instance_id)
+ health_monitor_instance_state = @@monitor_states[monitor_instance_id]
+ health_monitor_records = health_monitor_instance_state.prev_records #This should be an array
+
+ if health_monitor_records.size == samples_to_keep
+ health_monitor_records.delete_at(0)
+ end
+ health_monitor_records.push(monitor.details)
+ health_monitor_instance_state.prev_records = health_monitor_records
+ @@monitor_states[monitor_instance_id] = health_monitor_instance_state
+ else
+ # if samples_to_keep == 1, then set new state to be the health_monitor_record state, else set it as none
+
+ old_state = HealthMonitorStates::NONE
+ new_state = HealthMonitorStates::NONE
+ if samples_to_keep == 1
+ new_state = monitor.state
+ end
+
+ health_monitor_instance_state = HealthMonitorInstanceState.new(
+ monitor.transition_date_time,
+ old_state,
+ new_state,
+ monitor.transition_date_time,
+ [monitor.details])
+
+ health_monitor_instance_state.should_send = true
+ @@monitor_states[monitor_instance_id] = health_monitor_instance_state
+ end
+
+
+ # update old and new state based on the history and latest record.
+ # TODO: this is a little hairy. Simplify
+
+ health_monitor_records = health_monitor_instance_state.prev_records
+ if monitor_config['ConsecutiveSamplesForStateTransition'].nil?
+ samples_to_check = 1
+ else
+ samples_to_check = monitor_config['ConsecutiveSamplesForStateTransition'].to_i
+ end
+
+ latest_record = health_monitor_records[health_monitor_records.size-1] #since we push new records to the end, and remove oldest records from the beginning
+ latest_record_state = latest_record["state"]
+ latest_record_time = latest_record["timestamp"] #string representation of time
+
+ new_state = health_monitor_instance_state.new_state
+ prev_sent_time = health_monitor_instance_state.prev_sent_record_time
+
+ # check whether the latest monitor state matches the last sent state (new_state)
+ if latest_record_state.downcase == new_state.downcase
+ time_elapsed = (Time.parse(latest_record_time) - Time.parse(prev_sent_time)) / 60
+ # check if health signal has "timed out"
+ if time_elapsed > @@health_signal_timeout # minutes
+ # update record for last sent record time
+ health_monitor_instance_state.old_state = health_monitor_instance_state.new_state
+ health_monitor_instance_state.new_state = latest_record_state
+ health_monitor_instance_state.prev_sent_record_time = current_time
+ health_monitor_instance_state.should_send = true
+ #log.debug "After Updating Monitor State #{health_monitor_instance_state}"
+ set_state(monitor_instance_id, health_monitor_instance_state)
+ log.debug "#{monitor_instance_id} condition: signal timeout should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
+ # check if the first record has been sent
+ elsif !@@first_record_sent.key?(monitor_instance_id)
+ @@first_record_sent[monitor_instance_id] = true
+ health_monitor_instance_state.should_send = true
+ set_state(monitor_instance_id, health_monitor_instance_state)
+ end
+ # latest state is different from the last sent state
+ else
+ # if latest_record_state is NONE, send the record
+ if latest_record_state.downcase == HealthMonitorStates::NONE
+ health_monitor_instance_state.old_state = health_monitor_instance_state.new_state #initially old = new, so when state change occurs, assign old to be new, and set new to be the latest record state
+ health_monitor_instance_state.new_state = latest_record_state
+ health_monitor_instance_state.state_change_time = current_time
+ health_monitor_instance_state.prev_sent_record_time = current_time
+ health_monitor_instance_state.should_send = true
+ if !@@first_record_sent.key?(monitor_instance_id)
+ @@first_record_sent[monitor_instance_id] = true
+ end
+ set_state(monitor_instance_id, health_monitor_instance_state)
+ log.debug "#{monitor_instance_id} condition: NONE state should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
+ # if it is a monitor that needs to instantly notify on state change, update the state
+ # mark the monitor to be sent
+ elsif samples_to_check == 1
+ health_monitor_instance_state.old_state = health_monitor_instance_state.new_state #initially old = new, so when state change occurs, assign old to be new, and set new to be the latest record state
+ health_monitor_instance_state.new_state = latest_record_state
+ health_monitor_instance_state.state_change_time = current_time
+ health_monitor_instance_state.prev_sent_record_time = current_time
+ health_monitor_instance_state.should_send = true
+ if !@@first_record_sent.key?(monitor_instance_id)
+ @@first_record_sent[monitor_instance_id] = true
+ end
+ set_state(monitor_instance_id, health_monitor_instance_state)
+ log.debug "#{monitor_instance_id} condition: state change, samples_to_check = #{samples_to_check} should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
+ else
+ # state change from the previously sent state to the latest record state;
+ # check the state of the last n records to see if they are all in the same state
+ if (is_state_change_consistent(health_monitor_records, samples_to_keep))
+ first_record = health_monitor_records[0]
+ latest_record = health_monitor_records[health_monitor_records.size-1] #since we push new records to the end, and remove oldest records from the beginning
+ latest_record_state = latest_record["state"]
+ latest_record_time = latest_record["timestamp"] #string representation of time
+
+ health_monitor_instance_state.old_state = health_monitor_instance_state.new_state
+ health_monitor_instance_state.is_state_change_consistent = true # This way it wont be recomputed in the optimizer.
+ health_monitor_instance_state.should_send = true
+ health_monitor_instance_state.new_state = latest_record_state
+ health_monitor_instance_state.prev_sent_record_time = current_time
+ health_monitor_instance_state.state_change_time = current_time
+
+ set_state(monitor_instance_id, health_monitor_instance_state)
+
+ if !@@first_record_sent.key?(monitor_instance_id)
+ @@first_record_sent[monitor_instance_id] = true
+ end
+ log.debug "#{monitor_instance_id} condition: consistent state change, samples_to_check = #{samples_to_check} should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
+ end
+ end
+ end
+ end
+
+ private
+ def is_state_change_consistent(health_monitor_records, samples_to_check)
+ if health_monitor_records.nil? || health_monitor_records.size == 0 || health_monitor_records.size < samples_to_check
+ return false
+ end
+ i = 0
+ while i < health_monitor_records.size - 1
+ #log.debug "Prev: #{health_monitor_records[i].state} Current: #{health_monitor_records[i + 1].state}"
+ if health_monitor_records[i]["state"] != health_monitor_records[i + 1]["state"]
+ return false
+ end
+ i += 1
+ end
+ return true
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb
new file mode 100644
index 000000000..df47529e6
--- /dev/null
+++ b/source/code/plugin/health/health_monitor_utils.rb
@@ -0,0 +1,369 @@
+require 'logger'
+require 'digest'
+
+module HealthModel
+ # static class that provides a bunch of utility methods
+ class HealthMonitorUtils
+
+ begin
+ if !Gem.win_platform?
+ require_relative '../KubernetesApiClient'
+ end
+ rescue => e
+ $log.info "Error loading KubernetesApiClient #{e.message}"
+ end
+
+ @@node_inventory = []
+
+ @log_path = "/var/opt/microsoft/docker-cimprov/log/health_monitors.log"
+
+ if Gem.win_platform? #unit testing on windows dev machine
+ @log_path = "C:\Temp\health_monitors.log"
+ end
+
+ @log = Logger.new(@log_path, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M
+ @@last_refresh_time = '2019-01-01T00:00:00Z'
+
+ class << self
+ # compute the percentage state given a value and a monitor configuration
+ def compute_percentage_state(value, config)
+
+ if config.nil? || config['WarnThresholdPercentage'].nil?
+ warn_percentage = nil
+ else
+ warn_percentage = config['WarnThresholdPercentage'].to_f
+ end
+ fail_percentage = config['FailThresholdPercentage'].to_f
+
+ if value > fail_percentage
+ return HealthMonitorStates::FAIL
+ elsif !warn_percentage.nil? && value > warn_percentage
+ return HealthMonitorStates::WARNING
+ else
+ return HealthMonitorStates::PASS
+ end
+ end
+
+ def is_node_monitor(monitor_id)
+ return (monitor_id == HealthMonitorConstants::NODE_CPU_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_MEMORY_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_CONDITION_MONITOR_ID)
+ end
+
+ def is_pods_ready_monitor(monitor_id)
+ return (monitor_id == HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID || monitor_id == HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID)
+ end
+
+ def is_cluster_health_model_enabled
+ enabled = ENV["AZMON_CLUSTER_ENABLE_HEALTH_MODEL"]
+ if !enabled.nil? && enabled.casecmp("true") == 0
+ return true
+ else
+ return false
+ end
+ end
+
+ def get_pods_ready_hash(pod_inventory, deployment_inventory)
+ pods_ready_percentage_hash = {}
+ deployment_lookup = {}
+ deployment_inventory['items'].each do |deployment|
+ match_labels = deployment['spec']['selector']['matchLabels'].to_h
+ namespace = deployment['metadata']['namespace']
+ match_labels.each{|k,v|
+ deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}"
+ }
+ end
+ pod_inventory['items'].each do |pod|
+ begin
+ has_owner = !pod['metadata']['ownerReferences'].nil?
+ owner_kind = ''
+ if has_owner
+ owner_kind = pod['metadata']['ownerReferences'][0]['kind']
+ controller_name = pod['metadata']['ownerReferences'][0]['name']
+ else
+ owner_kind = pod['kind']
+ controller_name = pod['metadata']['name']
+ #log.info "#{JSON.pretty_generate(pod)}"
+ end
+
+ namespace = pod['metadata']['namespace']
+ status = pod['status']['phase']
+
+ workload_name = ''
+ if owner_kind.nil?
+ owner_kind = 'Pod'
+ end
+ case owner_kind.downcase
+ when 'job'
+ # we are excluding jobs
+ next
+ when 'replicaset'
+ # get the labels, and see if there is a match. If there is, it is the deployment. If not, use replica set name/controller name
+ labels = pod['metadata']['labels'].to_h
+ labels.each {|k,v|
+ lookup_key = "#{namespace}-#{k}=#{v}"
+ if deployment_lookup.key?(lookup_key)
+ workload_name = deployment_lookup[lookup_key]
+ break
+ end
+ }
+ if workload_name.empty?
+ workload_name = "#{namespace}~~#{controller_name}"
+ end
+ when 'daemonset'
+ workload_name = "#{namespace}~~#{controller_name}"
+ else
+ workload_name = "#{namespace}~~#{pod['metadata']['name']}"
+ end
+
+ if pods_ready_percentage_hash.key?(workload_name)
+ total_pods = pods_ready_percentage_hash[workload_name]['totalPods']
+ pods_ready = pods_ready_percentage_hash[workload_name]['podsReady']
+ else
+ total_pods = 0
+ pods_ready = 0
+ end
+
+ total_pods += 1
+ if status == 'Running'
+ pods_ready += 1
+ end
+
+ pods_ready_percentage_hash[workload_name] = {'totalPods' => total_pods, 'podsReady' => pods_ready, 'namespace' => namespace, 'workload_name' => workload_name, 'kind' => owner_kind}
+ rescue => e
+ log.info "Error when processing pod #{pod['metadata']['name']} #{e.message}"
+ end
+ end
+ return pods_ready_percentage_hash
+ end
+
+ def get_node_state_from_node_conditions(node_conditions)
+ pass = false
+ node_conditions.each do |condition|
+ type = condition['type']
+ status = condition['status']
+
+ if ((type == "NetworkUnavailable" || type == "OutOfDisk") && (status == 'True' || status == 'Unknown'))
+ return "fail"
+ elsif ((type == "DiskPressure" || type == "MemoryPressure" || type == "PIDPressure") && (status == 'True' || status == 'Unknown'))
+ return "warn"
+ elsif type == "Ready" && status == 'True'
+ pass = true
+ end
+ end
+
+ if pass
+ return "pass"
+ else
+ return "fail"
+ end
+ end
+
+ def get_resource_subscription(pod_inventory, metric_name, metric_capacity)
+ subscription = 0.0
+ if !pod_inventory.empty?
+ pod_inventory['items'].each do |pod|
+ pod['spec']['containers'].each do |container|
+ if !container['resources']['requests'].nil? && !container['resources']['requests'][metric_name].nil?
+ subscription += KubernetesApiClient.getMetricNumericValue(metric_name, container['resources']['requests'][metric_name])
+ end
+ end
+ end
+ end
+ #log.debug "#{metric_name} Subscription #{subscription}"
+ return subscription
+ end
+
+ def get_cluster_cpu_memory_capacity(log)
+ begin
+ node_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body)
+ cluster_cpu_capacity = 0.0
+ cluster_memory_capacity = 0.0
+ if !node_inventory.empty?
+ node_inventory['items'].each do |node|
+ cpu_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "cpu", "cpuCapacityNanoCores")
+ if !cpu_capacity_json.nil?
+ cpu_capacity_json.each do |cpu_capacity_node|
+ if !cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil?
+ cluster_cpu_capacity += cpu_capacity_node['DataItems'][0]['Collections'][0]['Value']
+ end
+ end
+ log.info "Cluster CPU Limit #{cluster_cpu_capacity}"
+ else
+ log.info "Error getting cpu_capacity"
+ end
+ memory_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "memory", "memoryCapacityBytes")
+ if !memory_capacity_json.nil?
+ memory_capacity_json.each do |memory_capacity_node|
+ if !memory_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil?
+ cluster_memory_capacity += memory_capacity_node['DataItems'][0]['Collections'][0]['Value']
+ end
+ end
+ log.info "Cluster Memory Limit #{cluster_memory_capacity}"
+ else
+ log.info "Error getting memory_capacity"
+ end
+ end
+ else
+ log.info "Unable to get cpu and memory capacity"
+ return [0.0, 0.0]
+ end
+ return [cluster_cpu_capacity, cluster_memory_capacity]
+ rescue => e
+ log.info e
+ end
+ end
+
+ def refresh_kubernetes_api_data(log, hostName, force: false)
+ #log.debug "refresh_kubernetes_api_data"
+ if ( ((Time.now.utc - Time.parse(@@last_refresh_time)) / 60 ) < 5.0 && !force)
+ log.debug "Less than 5 minutes since last refresh at #{@@last_refresh_time}"
+ return
+ end
+ if force
+ log.debug "Force Refresh"
+ end
+
+ begin
+ @@nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body)
+ if !hostName.nil?
+ podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("pods?fieldSelector=spec.nodeName%3D#{hostName}").body)
+ else
+ podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("pods").body)
+ end
+ podInventory['items'].each do |pod|
+ has_owner = !pod['metadata']['ownerReferences'].nil?
+ if !has_owner
+ workload_name = pod['metadata']['name']
+ else
+ workload_name = pod['metadata']['ownerReferences'][0]['name']
+ end
+ namespace = pod['metadata']['namespace']
+ #TODO: Figure this out for container cpu/memory
+ #@@controllerMapping[workload_name] = namespace
+ #log.debug "workload_name #{workload_name} namespace #{namespace}"
+ pod['spec']['containers'].each do |container|
+ key = [pod['metadata']['uid'], container['name']].join('/')
+
+ if !container['resources'].empty? && !container['resources']['limits'].nil? && !container['resources']['limits']['cpu'].nil?
+ cpu_limit_value = KubernetesApiClient.getMetricNumericValue('cpu', container['resources']['limits']['cpu'])
+ else
+ log.info "CPU limit not set for container : #{container['name']}. Using Node Capacity"
+ #TODO: Send warning health event #bestpractices
+ cpu_limit_value = @cpu_capacity
+ end
+
+ if !container['resources'].empty? && !container['resources']['limits'].nil? && !container['resources']['limits']['memory'].nil?
+ #log.info "Raw Memory Value #{container['resources']['limits']['memory']}"
+ memory_limit_value = KubernetesApiClient.getMetricNumericValue('memory', container['resources']['limits']['memory'])
+ else
+ log.info "Memory limit not set for container : #{container['name']}. Using Node Capacity"
+ memory_limit_value = @memory_capacity
+ end
+
+ #TODO: Figure this out for container cpu/memory
+ #@@containerMetadata[key] = {"cpuLimit" => cpu_limit_value, "memoryLimit" => memory_limit_value, "controllerName" => workload_name, "namespace" => namespace}
+ end
+ end
+ rescue => e
+ log.info "Error Refreshing Container Resource Limits #{e.backtrace}"
+ end
+ # log.info "Controller Mapping #{@@controllerMapping}"
+ # log.info "Node Inventory #{@@nodeInventory}"
+ # log.info "Container Metadata #{@@containerMetadata}"
+ # log.info "------------------------------------"
+ @@last_refresh_time = Time.now.utc.iso8601
+ end
+
+ def get_monitor_instance_id(monitor_id, args = [])
+ string_to_hash = args.join("/")
+ return "#{monitor_id}-#{Digest::MD5.hexdigest(string_to_hash)}"
+ end
+
+ def ensure_cpu_memory_capacity_set(log, cpu_capacity, memory_capacity, hostname)
+
+ log.info "ensure_cpu_memory_capacity_set cpu_capacity #{cpu_capacity} memory_capacity #{memory_capacity}"
+ if cpu_capacity != 0.0 && memory_capacity != 0.0
+ log.info "CPU And Memory Capacity are already set"
+ return [cpu_capacity, memory_capacity]
+ end
+
+ begin
+ @@nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body)
+ rescue Exception => e
+ log.info "Error when getting nodeInventory from kube API. Exception: #{e.class} Message: #{e.message} "
+ ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace)
+ end
+ if !@@nodeInventory.nil?
+ cpu_capacity_json = KubernetesApiClient.parseNodeLimits(@@nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores")
+ if !cpu_capacity_json.nil?
+ cpu_capacity_json.each do |cpu_info_node|
+ if !cpu_info_node['DataItems'][0]['Host'].nil? && cpu_info_node['DataItems'][0]['Host'] == hostname
+ if !cpu_info_node['DataItems'][0]['Collections'][0]['Value'].nil?
+ cpu_capacity = cpu_info_node['DataItems'][0]['Collections'][0]['Value']
+ end
+ end
+ end
+ log.info "CPU Limit #{cpu_capacity}"
+ else
+ log.info "Error getting cpu_capacity"
+ end
+ memory_capacity_json = KubernetesApiClient.parseNodeLimits(@@nodeInventory, "capacity", "memory", "memoryCapacityBytes")
+ if !memory_capacity_json.nil?
+ memory_capacity_json.each do |memory_info_node|
+ if !memory_info_node['DataItems'][0]['Host'].nil? && memory_info_node['DataItems'][0]['Host'] == hostname
+ if !memory_info_node['DataItems'][0]['Collections'][0]['Value'].nil?
+ memory_capacity = memory_info_node['DataItems'][0]['Collections'][0]['Value']
+ end
+ end
+ end
+ log.info "memory Limit #{memory_capacity}"
+ else
+ log.info "Error getting memory_capacity"
+ end
+ return [cpu_capacity, memory_capacity]
+ end
+ end
+
+ def build_metrics_hash(metrics_to_collect)
+ metrics_to_collect_arr = metrics_to_collect.split(',').map(&:strip)
+ metrics_hash = metrics_to_collect_arr.map {|x| [x.downcase,true]}.to_h
+ return metrics_hash
+ end
+
+ def get_health_monitor_config
+ health_monitor_config = {}
+ begin
+ file = File.open('/opt/microsoft/omsagent/plugin/healthmonitorconfig.json', "r")
+ if !file.nil?
+ fileContents = file.read
+ health_monitor_config = JSON.parse(fileContents)
+ file.close
+ end
+ rescue => e
+ log.info "Error when opening health config file #{e}"
+ end
+ return health_monitor_config
+ end
+
+ def get_cluster_labels
+ labels = {}
+ cluster_id = KubernetesApiClient.getClusterId
+ region = KubernetesApiClient.getClusterRegion
+ labels['container.azm.ms/cluster-region'] = region
+ if !cluster_id.nil?
+ cluster_id_elements = cluster_id.split('/')
+ azure_sub_id = cluster_id_elements[2]
+ resource_group = cluster_id_elements[4]
+ cluster_name = cluster_id_elements[8]
+ labels['container.azm.ms/cluster-subscription-id'] = azure_sub_id
+ labels['container.azm.ms/cluster-resource-group'] = resource_group
+ labels['container.azm.ms/cluster-name'] = cluster_name
+ end
+ return labels
+ end
+
+ def get_log_handle
+ return @log
+ end
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/health_signal_reducer.rb b/source/code/plugin/health/health_signal_reducer.rb
new file mode 100644
index 000000000..4cf53e82c
--- /dev/null
+++ b/source/code/plugin/health/health_signal_reducer.rb
@@ -0,0 +1,51 @@
+module HealthModel
+ # This class:
+ # 1. dedupes daemon set signals and takes only the latest
+ # 2. removes signals for objects that are no longer in the inventory, e.g. a node might have sent a signal before being scaled down
+ class HealthSignalReducer
+ def initialize
+
+ end
+
+ def reduce_signals(health_monitor_records, health_k8s_inventory)
+ nodes = health_k8s_inventory.get_nodes
+ workload_names = health_k8s_inventory.get_workload_names
+ reduced_signals_map = {}
+ reduced_signals = []
+ health_monitor_records.each{|health_monitor_record|
+ monitor_instance_id = health_monitor_record.monitor_instance_id
+ monitor_id = health_monitor_record.monitor_id
+ if reduced_signals_map.key?(monitor_instance_id)
+ record = reduced_signals_map[monitor_instance_id]
+ if health_monitor_record.transition_date_time > record.transition_date_time # always take the latest record for a monitor instance id
+ puts 'Duplicate Daemon Set signal'
+ reduced_signals_map[monitor_instance_id] = health_monitor_record
+ end
+ elsif HealthMonitorHelpers.is_node_monitor(monitor_id)
+ node_name = health_monitor_record.labels['kubernetes.io/hostname']
+ if (node_name.nil? || !nodes.include?(node_name)) # only add daemon set records if node is present in the inventory
+ next
+ end
+ reduced_signals_map[monitor_instance_id] = health_monitor_record
+ elsif HealthMonitorHelpers.is_pods_ready_monitor(monitor_id)
+ workload_name = health_monitor_record.labels[HealthMonitorLabels::WORKLOAD_NAME]
+ namespace = health_monitor_record.labels[HealthMonitorLabels::NAMESPACE]
+ lookup = "#{namespace}~~#{workload_name}"
+ if (workload_name.nil? || !workload_names.include?(lookup)) #only add pod record if present in the inventory
+ next
+ end
+ reduced_signals_map[monitor_instance_id] = health_monitor_record
+ else
+ reduced_signals_map[monitor_instance_id] = health_monitor_record
+ end
+ }
+
+ reduced_signals_map.each{|k,v|
+ reduced_signals.push(v)
+ }
+
+ return reduced_signals
+ end
+
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/monitor_factory.rb b/source/code/plugin/health/monitor_factory.rb
new file mode 100644
index 000000000..e6ec9d2c3
--- /dev/null
+++ b/source/code/plugin/health/monitor_factory.rb
@@ -0,0 +1,28 @@
+module HealthModel
+ class MonitorFactory
+
+ def initialize
+
+ end
+
+ def create_unit_monitor(monitor_record)
+ return UnitMonitor.new(monitor_record.monitor_id,
+ monitor_record.monitor_instance_id,
+ monitor_record.state,
+ monitor_record.transition_date_time,
+ monitor_record.labels,
+ monitor_record.config,
+ monitor_record.details)
+ end
+
+ def create_aggregate_monitor(monitor_id, monitor_instance_id, labels, aggregation_algorithm, aggregation_algorithm_params, child_monitor)
+ return AggregateMonitor.new(monitor_id,
+ monitor_instance_id,
+ child_monitor.state,
+ child_monitor.transition_date_time,
+ aggregation_algorithm,
+ aggregation_algorithm_params,
+ labels)
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/monitor_set.rb b/source/code/plugin/health/monitor_set.rb
new file mode 100644
index 000000000..8d5994419
--- /dev/null
+++ b/source/code/plugin/health/monitor_set.rb
@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+
+module HealthModel
+    # Collection of health monitors keyed by monitor_instance_id.
+    class MonitorSet
+        attr_accessor :monitors
+
+        #constructor
+        def initialize
+            @monitors = {}
+        end
+
+        # checks if the monitor is present in the set
+        def contains?(monitor_instance_id)
+            @monitors.key?(monitor_instance_id)
+        end
+
+        # adds or updates the monitor, keyed by its monitor_instance_id
+        def add_or_update(monitor)
+            @monitors[monitor.monitor_instance_id] = monitor
+        end
+
+        # gets the monitor given the monitor instance id; returns nil if absent
+        def get_monitor(monitor_instance_id)
+            @monitors[monitor_instance_id] if @monitors.key?(monitor_instance_id)
+        end
+
+        # deletes a monitor from the set (no-op when the id is not present)
+        def delete(monitor_instance_id)
+            if @monitors.key?(monitor_instance_id)
+                @monitors.delete(monitor_instance_id)
+            end
+        end
+
+        # gets the size of the monitor set
+        def get_size
+            @monitors.length
+        end
+
+        # gets the underlying map of monitor instance id => monitor
+        def get_map
+            @monitors
+        end
+    end
+end
diff --git a/source/code/plugin/health/node_monitor_hierarchy_reducer.rb b/source/code/plugin/health/node_monitor_hierarchy_reducer.rb
new file mode 100644
index 000000000..aafbd07a8
--- /dev/null
+++ b/source/code/plugin/health/node_monitor_hierarchy_reducer.rb
@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+
+module HealthModel
+    # Optimizes the node branch of the health model: collapses the constant
+    # per-cluster aggregates (ALL_AGENT_NODE_POOLS, ALL_NODES) when they have
+    # exactly one member, splicing the grandchild monitors up one level so the
+    # tree sent downstream is smaller.
+    class NodeMonitorHierarchyReducer
+        def initialize
+        end
+
+        # Collapses single-child node-pool aggregates in the given monitor_set
+        # (mutates the set in place).
+        def finalize(monitor_set)
+            monitors_to_reduce = [MonitorId::ALL_AGENT_NODE_POOLS, MonitorId::ALL_NODES]
+            # for the above monitors, which are constant per cluster, the monitor_id and monitor_instance_id are the same
+            monitors_to_reduce.each do |monitor_to_reduce|
+                monitor = monitor_set.get_monitor(monitor_to_reduce)
+                if !monitor.nil?
+                    # only collapse when the aggregate has exactly one member
+                    if monitor.is_aggregate_monitor && monitor.get_member_monitors.size == 1
+                        #copy the children of member monitor as children of parent
+                        member_monitor_instance_id = monitor.get_member_monitors[0] #gets the only member monitor instance id
+                        member_monitor = monitor_set.get_monitor(member_monitor_instance_id)
+                        #reduce only if the aggregation algorithms are the same (worstOf); otherwise rollup semantics would change
+                        if !member_monitor.aggregation_algorithm.nil? && member_monitor.aggregation_algorithm == AggregationAlgorithm::WORSTOF && monitor.aggregation_algorithm == member_monitor.aggregation_algorithm
+                            member_monitor.get_member_monitors.each{|grandchild_monitor|
+                                monitor.add_member_monitor(grandchild_monitor)
+                            }
+                            monitor.remove_member_monitor(member_monitor_instance_id)
+                            # delete the member monitor from the monitor_set
+                            monitor_set.delete(member_monitor_instance_id)
+                        end
+                    end
+                end
+            end
+        end
+    end
+end
diff --git a/source/code/plugin/health/parent_monitor_provider.rb b/source/code/plugin/health/parent_monitor_provider.rb
new file mode 100644
index 000000000..6a27f11d8
--- /dev/null
+++ b/source/code/plugin/health/parent_monitor_provider.rb
@@ -0,0 +1,86 @@
+module HealthModel
+    # Resolves parent monitor information (id, instance id, labels, config) for
+    # a monitor using the parsed health model definition. Lookups are memoized
+    # per monitor_instance_id because the hierarchy is static for a cluster.
+    class ParentMonitorProvider
+
+        attr_reader :health_model_definition, :parent_monitor_mapping, :parent_monitor_instance_mapping
+
+        def initialize(definition)
+            @health_model_definition = definition
+            @parent_monitor_mapping = {} #monitorId --> parent_monitor_id mapping
+            @parent_monitor_instance_mapping = {} #child monitor id -- > parent monitor instance mapping. Used in instances when the node no longer exists and impossible to compute from kube api results
+        end
+
+        # gets the parent monitor id given the state transition. It requires the monitor id and labels to determine the parent id
+        # Raises for monitor ids missing from the definition, and for
+        # condition-based mappings where no condition matched (except the
+        # CLUSTER root, which falls through -- implicitly returning nil;
+        # NOTE(review): presumably intentional for the root monitor, confirm).
+        def get_parent_monitor_id(monitor)
+            monitor_id = monitor.monitor_id
+
+            # cache the parent monitor id so it is not recomputed every time
+            if @parent_monitor_mapping.key?(monitor.monitor_instance_id)
+                return @parent_monitor_mapping[monitor.monitor_instance_id]
+            end
+
+            if @health_model_definition.key?(monitor_id)
+                parent_monitor_id = @health_model_definition[monitor_id]['parent_monitor_id']
+                # check parent_monitor_id is an array, then evaluate the conditions, else return the parent_monitor_id
+                if parent_monitor_id.is_a?(String)
+                    @parent_monitor_mapping[monitor.monitor_instance_id] = parent_monitor_id
+                    return parent_monitor_id
+                end
+                if parent_monitor_id.nil?
+                    conditions = @health_model_definition[monitor_id]['conditions']
+                    if !conditions.nil? && conditions.is_a?(Array)
+                        labels = monitor.labels
+                        conditions.each{|condition|
+                            left = "#{labels[condition['key']]}"
+                            op = "#{condition['operator']}"
+                            right = "#{condition['value']}"
+                            # NOTE(review): the operator comes from the model definition file and
+                            # is dispatched via send on a String (e.g. :==) -- safe only while the
+                            # definition file is trusted, deployment-controlled content.
+                            cond = left.send(op.to_sym, right)
+
+                            if cond
+                                @parent_monitor_mapping[monitor.monitor_instance_id] = condition['parent_id']
+                                return condition['parent_id']
+                            end
+                        }
+                    end
+                    raise "Conditions were not met to determine the parent monitor id" if monitor_id != MonitorId::CLUSTER
+                end
+            else
+                raise "Invalid Monitor Id #{monitor_id} in get_parent_monitor_id"
+            end
+        end
+
+        # Copies the labels named by the child's definition ('labels' key) from
+        # the child's labels to form the parent monitor's labels.
+        # NOTE(review): the parent_monitor_id parameter is unused here -- confirm.
+        def get_parent_monitor_labels(monitor_id, monitor_labels, parent_monitor_id)
+            labels_to_copy = @health_model_definition[monitor_id]['labels']
+            if labels_to_copy.nil?
+                return {}
+            end
+            parent_monitor_labels = {}
+            labels_to_copy.each{|label|
+                parent_monitor_labels[label] = monitor_labels[label]
+            }
+            return parent_monitor_labels
+        end
+
+        # Returns the definition entry (config) for the parent monitor id.
+        def get_parent_monitor_config(parent_monitor_id)
+            return @health_model_definition[parent_monitor_id]
+        end
+
+        # Computes (and memoizes) the parent monitor instance id as
+        # "<parent_monitor_id>-<label values>", or just parent_monitor_id when
+        # the parent takes no instance labels.
+        def get_parent_monitor_instance_id(monitor_instance_id, parent_monitor_id, parent_monitor_labels)
+            if @parent_monitor_instance_mapping.key?(monitor_instance_id)
+                return @parent_monitor_instance_mapping[monitor_instance_id]
+            end
+
+            labels = AggregateMonitorInstanceIdLabels.get_labels_for(parent_monitor_id)
+            if !labels.is_a?(Array)
+                raise "Expected #{labels} to be an Array for #{parent_monitor_id}"
+            end
+            values = labels.map{|label| parent_monitor_labels[label]}
+            if values.nil? || values.empty? || values.size == 0
+                @parent_monitor_instance_mapping[monitor_instance_id] = parent_monitor_id
+                return parent_monitor_id
+            end
+            parent_monitor_instance_id = "#{parent_monitor_id}-#{values.join('-')}"
+            @parent_monitor_instance_mapping[monitor_instance_id] = parent_monitor_instance_id
+            return parent_monitor_instance_id
+        end
+    end
+end
\ No newline at end of file
diff --git a/source/code/plugin/health/unit_monitor.rb b/source/code/plugin/health/unit_monitor.rb
new file mode 100644
index 000000000..9af599321
--- /dev/null
+++ b/source/code/plugin/health/unit_monitor.rb
@@ -0,0 +1,26 @@
+require_relative 'health_model_constants'
+require 'json'
+
+module HealthModel
+    # Leaf monitor in the health model tree: carries the raw signal state for a
+    # single monitored entity and never has member (child) monitors.
+    class UnitMonitor
+
+        attr_accessor :monitor_id, :monitor_instance_id, :state, :transition_date_time, :labels, :config, :details, :is_aggregate_monitor
+
+        # constructor
+        def initialize(monitor_id, monitor_instance_id, state, transition_date_time, labels, config, details)
+            @monitor_id = monitor_id
+            @monitor_instance_id = monitor_instance_id
+            @transition_date_time = transition_date_time
+            @state = state
+            @labels = labels
+            @config = config
+            @details = details
+            @is_aggregate_monitor = false # distinguishes leaves from aggregates in tree walks
+        end
+
+        # unit monitors are leaves, so there are never member monitors
+        def get_member_monitors
+            return nil
+        end
+
+    end
+end
\ No newline at end of file
diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb
index f5f65f01b..1702877a2 100644
--- a/source/code/plugin/in_cadvisor_perf.rb
+++ b/source/code/plugin/in_cadvisor_perf.rb
@@ -2,6 +2,7 @@
# frozen_string_literal: true
module Fluent
+
class CAdvisor_Perf_Input < Input
Plugin.register_input("cadvisorperf", self)
@@ -18,6 +19,8 @@ def initialize
config_param :run_interval, :time, :default => "1m"
config_param :tag, :string, :default => "oms.api.cadvisorperf"
config_param :mdmtag, :string, :default => "mdm.cadvisorperf"
+ config_param :nodehealthtag, :string, :default => "oms.api.KubeHealth.DaemonSet.Node"
+ #config_param :containerhealthtag, :string, :default => "oms.api.KubeHealth.DaemonSet.Container"
def configure(conf)
super
@@ -51,11 +54,14 @@ def enumerate()
record["DataType"] = "LINUX_PERF_BLOB"
record["IPName"] = "LogManagement"
eventStream.add(time, record) if record
- #router.emit(@tag, time, record) if record
- end
+ #router.emit(@tag, time, record) if record
+ end
router.emit_stream(@tag, eventStream) if eventStream
router.emit_stream(@mdmtag, eventStream) if eventStream
+ #router.emit_stream(@containerhealthtag, eventStream) if eventStream
+ router.emit_stream(@nodehealthtag, eventStream) if eventStream
+
@@istestvar = ENV["ISTEST"]
if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0)
$log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}")
diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb
index 3a0e04c67..f177b62bf 100644
--- a/source/code/plugin/in_kube_events.rb
+++ b/source/code/plugin/in_kube_events.rb
@@ -67,7 +67,7 @@ def enumerate(eventList = nil)
newEventQueryState.push(eventId)
if !eventQueryState.empty? && eventQueryState.include?(eventId)
next
- end
+ end
record["ObjectKind"] = items["involvedObject"]["kind"]
record["Namespace"] = items["involvedObject"]["namespace"]
record["Name"] = items["involvedObject"]["name"]
@@ -94,12 +94,12 @@ def enumerate(eventList = nil)
eventStream.add(emitTime, wrapper) if wrapper
end
router.emit_stream(@tag, eventStream) if eventStream
- end
+ end
writeEventQueryState(newEventQueryState)
rescue => errorStr
$log.debug_backtrace(errorStr.backtrace)
ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
- end
+ end
end
def run_periodic
diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb
new file mode 100644
index 000000000..d9672da3b
--- /dev/null
+++ b/source/code/plugin/in_kube_health.rb
@@ -0,0 +1,307 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+require_relative "KubernetesApiClient"
+require_relative "oms_common"
+require_relative "omslog"
+require_relative "ApplicationInsightsUtility"
+
+module Fluent
+
+ Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file }
+ class KubeHealthInput < Input
+ Plugin.register_input("kubehealth", self)
+
+ config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json'
+
+ @@clusterCpuCapacity = 0.0
+ @@clusterMemoryCapacity = 0.0
+
+ def initialize
+ super
+ require "yaml"
+ require "json"
+
+ @@cluster_id = KubernetesApiClient.getClusterId
+ @resources = HealthKubernetesResources.instance
+ @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path)
+ @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled
+ end
+
+ include HealthModel
+ config_param :run_interval, :time, :default => "1m"
+ config_param :tag, :string, :default => "oms.api.KubeHealth.ReplicaSet"
+
+ def configure(conf)
+ super
+ end
+
+ def start
+ begin
+ if @run_interval
+ @finished = false
+ @condition = ConditionVariable.new
+ @mutex = Mutex.new
+ @thread = Thread.new(&method(:run_periodic))
+
+ @@hmlog = HealthMonitorUtils.get_log_handle
+ @@clusterName = KubernetesApiClient.getClusterName
+ @@clusterRegion = KubernetesApiClient.getClusterRegion
+ cluster_capacity = HealthMonitorUtils.get_cluster_cpu_memory_capacity(@@hmlog)
+ @@clusterCpuCapacity = cluster_capacity[0]
+ @@clusterMemoryCapacity = cluster_capacity[1]
+ @@hmlog.info "Cluster CPU Capacity: #{@@clusterCpuCapacity} Memory Capacity: #{@@clusterMemoryCapacity}"
+ if @@cluster_health_model_enabled
+ ApplicationInsightsUtility.sendCustomEvent("in_kube_health Plugin Start", {})
+ end
+ end
+ rescue => e
+ ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"})
+ end
+ end
+
+ def shutdown
+ if @run_interval
+ @mutex.synchronize {
+ @finished = true
+ @condition.signal
+ }
+ @thread.join
+ end
+ end
+
+ def enumerate
+ begin
+ if !@@cluster_health_model_enabled
+ @@hmlog.info "Cluster Health Model disabled in in_kube_health"
+ return
+ end
+
+ currentTime = Time.now
+ emitTime = currentTime.to_f
+ batchTime = currentTime.utc.iso8601
+ health_monitor_records = []
+ eventStream = MultiEventStream.new
+
+ #HealthMonitorUtils.refresh_kubernetes_api_data(@@hmlog, nil)
+ # we do this so that if the call fails, we get a response code/header etc.
+ node_inventory_response = KubernetesApiClient.getKubeResourceInfo("nodes")
+ node_inventory = JSON.parse(node_inventory_response.body)
+ pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods")
+ pod_inventory = JSON.parse(pod_inventory_response.body)
+ deployment_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("deployments", api_version: "extensions/v1beta1").body)
+
+ @resources.node_inventory = node_inventory
+ @resources.pod_inventory = pod_inventory
+ @resources.deployment_inventory = deployment_inventory
+
+ if node_inventory_response.code.to_i != 200
+ record = process_kube_api_up_monitor("fail", node_inventory_response)
+ health_monitor_records.push(record) if record
+ else
+ record = process_kube_api_up_monitor("pass", node_inventory_response)
+ health_monitor_records.push(record) if record
+ end
+
+ if !pod_inventory.nil?
+ record = process_cpu_oversubscribed_monitor(pod_inventory)
+ health_monitor_records.push(record) if record
+ record = process_memory_oversubscribed_monitor(pod_inventory)
+ health_monitor_records.push(record) if record
+ pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(pod_inventory, deployment_inventory)
+
+ system_pods = pods_ready_hash.select{|k,v| v['namespace'] == 'kube-system'}
+ workload_pods = pods_ready_hash.select{|k,v| v['namespace'] != 'kube-system'}
+
+ system_pods_ready_percentage_records = process_pods_ready_percentage(system_pods, HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID)
+ system_pods_ready_percentage_records.each do |record|
+ health_monitor_records.push(record) if record
+ end
+
+ workload_pods_ready_percentage_records = process_pods_ready_percentage(workload_pods, HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID)
+ workload_pods_ready_percentage_records.each do |record|
+ health_monitor_records.push(record) if record
+ end
+ else
+ hmlog.info "POD INVENTORY IS NIL"
+ end
+
+ if !node_inventory.nil?
+ node_condition_records = process_node_condition_monitor(node_inventory)
+ node_condition_records.each do |record|
+ health_monitor_records.push(record) if record
+ end
+ else
+ hmlog.info "NODE INVENTORY IS NIL"
+ end
+
+ health_monitor_records.each do |record|
+ eventStream.add(emitTime, record)
+ end
+ router.emit_stream(@tag, eventStream) if eventStream
+ rescue => errorStr
+ @@hmlog.warn("error in_kube_health: #{errorStr.to_s}")
+ @@hmlog.debug "backtrace Input #{errorStr.backtrace}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ end
+
+ def process_cpu_oversubscribed_monitor(pod_inventory)
+ timestamp = Time.now.utc.iso8601
+ subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory,"cpu", @@clusterCpuCapacity)
+ state = subscription > @@clusterCpuCapacity ? "fail" : "pass"
+ #@@hmlog.debug "CPU Oversubscribed Monitor State : #{state}"
+
+ #CPU
+ monitor_id = HealthMonitorConstants::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"clusterCpuCapacity" => @@clusterCpuCapacity/1000000.to_f, "clusterCpuRequests" => subscription/1000000.to_f}}
+ # @@hmlog.info health_monitor_record
+
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id])
+ #hmlog.info "Monitor Instance Id: #{monitor_instance_id}"
+ health_record = {}
+ time_now = Time.now.utc.iso8601
+ health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id
+ health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
+ health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record
+ health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now
+ health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now
+ health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id
+ #@@hmlog.info "Successfully processed process_cpu_oversubscribed_monitor"
+ return health_record
+ end
+
+        # Checks whether total memory requests across pods exceed the cluster
+        # memory capacity and returns a pass/fail health record for the
+        # WORKLOAD_MEMORY_OVERSUBSCRIBED monitor.
+        def process_memory_oversubscribed_monitor(pod_inventory)
+            timestamp = Time.now.utc.iso8601
+            subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory,"memory", @@clusterMemoryCapacity)
+            state = subscription > @@clusterMemoryCapacity ? "fail" : "pass"
+            #@@hmlog.debug "Memory Oversubscribed Monitor State : #{state}"
+
+            #Memory (comment previously said "CPU" -- this is the memory monitor)
+            monitor_id = HealthMonitorConstants::WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID
+            health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"clusterMemoryCapacity" => @@clusterMemoryCapacity.to_f, "clusterMemoryRequests" => subscription.to_f}}
+            hmlog = HealthMonitorUtils.get_log_handle # NOTE(review): assigned but unused below -- confirm before removing
+
+            monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id])
+            health_record = {}
+            time_now = Time.now.utc.iso8601
+            health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id
+            health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
+            health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record
+            health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now
+            health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now
+            health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id
+            #@@hmlog.info "Successfully processed process_memory_oversubscribed_monitor"
+            return health_record
+        end
+
+ def process_kube_api_up_monitor(state, response)
+ timestamp = Time.now.utc.iso8601
+
+ monitor_id = HealthMonitorConstants::KUBE_API_STATUS
+ details = response.each_header.to_h
+ details['ResponseCode'] = response.code
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => details}
+ hmlog = HealthMonitorUtils.get_log_handle
+ #hmlog.info health_monitor_record
+
+ monitor_instance_id = HealthMonitorConstants::KUBE_API_STATUS
+ #hmlog.info "Monitor Instance Id: #{monitor_instance_id}"
+ health_record = {}
+ time_now = Time.now.utc.iso8601
+ health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id
+ health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
+ health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record
+ health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now
+ health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now
+ health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id
+ #@@hmlog.info "Successfully processed process_kube_api_up_monitor"
+ return health_record
+ end
+
+ def process_pods_ready_percentage(pods_hash, config_monitor_id)
+ monitor_config = @provider.get_config(config_monitor_id)
+ hmlog = HealthMonitorUtils.get_log_handle
+
+ records = []
+ pods_hash.keys.each do |key|
+ workload_name = key
+ total_pods = pods_hash[workload_name]['totalPods']
+ pods_ready = pods_hash[workload_name]['podsReady']
+ namespace = pods_hash[workload_name]['namespace']
+ workload_kind = pods_hash[workload_name]['kind']
+ percent = pods_ready / total_pods * 100
+ timestamp = Time.now.utc.iso8601
+
+ state = HealthMonitorUtils.compute_percentage_state((100-percent), monitor_config)
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"totalPods" => total_pods, "podsReady" => pods_ready, "workloadName" => workload_name, "namespace" => namespace, "workloadKind" => workload_kind}}
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(config_monitor_id, [@@cluster_id, namespace, workload_name])
+ health_record = {}
+ time_now = Time.now.utc.iso8601
+ health_record[HealthMonitorRecordFields::MONITOR_ID] = config_monitor_id
+ health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
+ health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record
+ health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now
+ health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now
+ health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id
+ records.push(health_record)
+ end
+ #@@hmlog.info "Successfully processed pods_ready_percentage for #{config_monitor_id} #{records.size}"
+ return records
+ end
+
+ def process_node_condition_monitor(node_inventory)
+ monitor_id = HealthMonitorConstants::NODE_CONDITION_MONITOR_ID
+ timestamp = Time.now.utc.iso8601
+ monitor_config = @provider.get_config(monitor_id)
+ node_condition_monitor_records = []
+ if !node_inventory.nil?
+ node_inventory['items'].each do |node|
+ node_name = node['metadata']['name']
+ conditions = node['status']['conditions']
+ state = HealthMonitorUtils.get_node_state_from_node_conditions(conditions)
+ #hmlog.debug "Node Name = #{node_name} State = #{state}"
+ details = {}
+ conditions.each do |condition|
+ details[condition['type']] = {"Reason" => condition['reason'], "Message" => condition['message']}
+ end
+ health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => details}
+ monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id, node_name])
+ health_record = {}
+ time_now = Time.now.utc.iso8601
+ health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id
+ health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
+ health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record
+ health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now
+ health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now
+ health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id
+ health_record[HealthMonitorRecordFields::NODE_NAME] = node_name
+ node_condition_monitor_records.push(health_record)
+ end
+ end
+ #@@hmlog.info "Successfully processed process_node_condition_monitor #{node_condition_monitor_records.size}"
+ return node_condition_monitor_records
+ end
+
+ def run_periodic
+ @mutex.lock
+ done = @finished
+ until done
+ @condition.wait(@mutex, @run_interval)
+ done = @finished
+ @mutex.unlock
+ if !done
+ begin
+ @@hmlog.info("in_kube_health::run_periodic @ #{Time.now.utc.iso8601}")
+ enumerate
+ rescue => errorStr
+ @@hmlog.warn "in_kube_health::run_periodic: enumerate Failed for kubeapi sourced data health: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ end
+ @mutex.lock
+ end
+ @mutex.unlock
+ end
+ end
+end
diff --git a/test/code/plugin/filter_health_model_builder_test.rb b/test/code/plugin/filter_health_model_builder_test.rb
new file mode 100644
index 000000000..f4dba11ed
--- /dev/null
+++ b/test/code/plugin/filter_health_model_builder_test.rb
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+require 'test/unit'
+require 'json'
+# require_relative '../../../source/code/plugin/health'
+
+Dir[File.join(__dir__, '../../../source/code/plugin/health', '*.rb')].each { |file| require file }
+
+class FilterHealthModelBuilderTest < Test::Unit::TestCase
+ include HealthModel
+
+ def test_event_stream
+ health_definition_path = 'C:\AzureMonitor\ContainerInsights\Docker-Provider\installer\conf\health_model_definition.json'
+ health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file)
+ monitor_factory = MonitorFactory.new
+ hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory)
+ # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side
+ state_finalizers = [AggregateMonitorStateFinalizer.new]
+ monitor_set = MonitorSet.new
+ model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set)
+
+ i = 1
+ loop do
+ mock_data_path = "C:/AzureMonitor/ContainerInsights/Docker-Provider/source/code/plugin/mock_data-#{i}.json"
+ file = File.read(mock_data_path)
+ data = JSON.parse(file)
+
+ health_monitor_records = []
+ data.each do |record|
+ health_monitor_record = HealthMonitorRecord.new(
+ record[HealthMonitorRecordFields::MONITOR_ID],
+ record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID],
+ record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED],
+ record[HealthMonitorRecordFields::DETAILS]["state"],
+ record[HealthMonitorRecordFields::MONITOR_LABELS],
+ record[HealthMonitorRecordFields::MONITOR_CONFIG],
+ record[HealthMonitorRecordFields::DETAILS]
+ )
+ state_transitions.push(state_transition)
+ end
+
+ model_builder.process_state_transitions(state_transitions)
+ changed_monitors = model_builder.finalize_model
+ changed_monitors.keys.each{|key|
+ puts key
+ }
+ i = i + 1
+ if i == 6
+ break
+ end
+ end
+ puts "Done"
+ end
+end
diff --git a/test/code/plugin/health/aggregate_monitor_spec.rb b/test/code/plugin/health/aggregate_monitor_spec.rb
new file mode 100644
index 000000000..729965999
--- /dev/null
+++ b/test/code/plugin/health/aggregate_monitor_spec.rb
@@ -0,0 +1,256 @@
+require_relative '../test_helpers'
+
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+
+describe "AggregateMonitor Spec" do
+ it "is_aggregate_monitor is true for AggregateMonitor" do
+ # Arrange/Act
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {})
+ # Assert
+ assert_equal monitor.is_aggregate_monitor, true
+ end
+
+ it "add_member_monitor tests -- adds a member monitor as a child monitor" do
+ # Arrange
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {})
+ #Act
+ monitor.add_member_monitor("child_monitor_1")
+ #Assert
+ assert_equal monitor.get_member_monitors.include?("child_monitor_1"), true
+
+ #Act
+ monitor.add_member_monitor("child_monitor_1")
+ #Assert
+ assert_equal monitor.get_member_monitors.size, 1
+ end
+
+ it "remove_member_monitor tests -- removes a member monitor as a child monitor" do
+ # Arrange
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {})
+ monitor.add_member_monitor("child_monitor_1")
+ monitor.add_member_monitor("child_monitor_2")
+
+ #Act
+ monitor.remove_member_monitor("child_monitor_1")
+ #Assert
+ assert_equal monitor.get_member_monitors.size, 1
+
+ #Act
+ monitor.remove_member_monitor("unknown_child")
+ #Assert
+ assert_equal monitor.get_member_monitors.size, 1
+ end
+
+ it "calculate_details tests -- calculates rollup details based on member monitor states" do
+ # Arrange
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {})
+
+ child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {})
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {})
+
+ monitor_set = MonitorSet.new
+ monitor_set.add_or_update(child_monitor_1)
+ monitor_set.add_or_update(child_monitor_2)
+
+ monitor.add_member_monitor("child_monitor_1")
+ monitor.add_member_monitor("child_monitor_2")
+
+ #Act
+ monitor.calculate_details(monitor_set)
+ #Assert
+ assert_equal monitor.details["details"], {"pass"=>["child_monitor_1"], "fail"=>["child_monitor_2"]}
+
+ #Arrange
+ child_monitor_3 = UnitMonitor.new("monitor_3", "child_monitor_3", "pass", "time", {}, {}, {})
+ monitor_set.add_or_update(child_monitor_3)
+ monitor.add_member_monitor("child_monitor_3")
+
+ #Act
+ monitor.calculate_details(monitor_set)
+ #Assert
+ assert_equal monitor.details["details"], {"pass"=>["child_monitor_1", "child_monitor_3"], "fail"=>["child_monitor_2"]}
+ end
+
+ it "calculate_state tests -- raises when right aggregation_algorithm NOT specified" do
+ # Arrange
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "", [], {})
+ #Assert
+ assert_raises do
+ monitor.calculate_state(monitor_set)
+ end
+ end
+
+ it "calculate_state tests -- calculate_worst_of_state " do
+ # Arrange -- pass, fail = fail
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {})
+
+ child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {})
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {})
+
+ monitor_set = MonitorSet.new
+ monitor_set.add_or_update(child_monitor_1)
+ monitor_set.add_or_update(child_monitor_2)
+
+ monitor.add_member_monitor("child_monitor_1")
+ monitor.add_member_monitor("child_monitor_2")
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "fail"
+
+ #Arrange -- pass, pass = pass
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "pass", "time", {}, {}, {})
+ monitor_set.add_or_update(child_monitor_2)
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "pass"
+
+ #Arrange -- pass, warn = warn
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "warn", "time", {}, {}, {})
+ monitor_set.add_or_update(child_monitor_2)
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "warn"
+
+ #Arrange -- warn, fail = fail
+ child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "warn", "time", {}, {}, {})
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {})
+ monitor_set.add_or_update(child_monitor_1)
+ monitor_set.add_or_update(child_monitor_2)
+
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "fail"
+
+    #Arrange -- warn, unknown = warn (the assertion below expects "warn" from worstOf)
+ child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "warn", "time", {}, {}, {})
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "unknown", "time", {}, {}, {})
+ monitor_set.add_or_update(child_monitor_1)
+ monitor_set.add_or_update(child_monitor_2)
+
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "warn"
+
+ #Arrange -- pass, unknown = unknown
+ child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {})
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "unknown", "time", {}, {}, {})
+ monitor_set.add_or_update(child_monitor_1)
+ monitor_set.add_or_update(child_monitor_2)
+
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "unknown"
+ end
+
+ it "calculate_state tests -- calculate_percentage_state " do
+ # Arrange
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 90.0}, {})
+
+ child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {})
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {})
+
+ monitor_set = MonitorSet.new
+ monitor_set.add_or_update(child_monitor_1)
+ monitor_set.add_or_update(child_monitor_2)
+
+ monitor.add_member_monitor("child_monitor_1")
+ monitor.add_member_monitor("child_monitor_2")
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "fail"
+
+ #Arrange
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 50.0}, {})
+ child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {})
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {})
+
+ monitor_set = MonitorSet.new
+ monitor_set.add_or_update(child_monitor_1)
+ monitor_set.add_or_update(child_monitor_2)
+
+ monitor.add_member_monitor("child_monitor_1")
+ monitor.add_member_monitor("child_monitor_2")
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "pass"
+
+ #Arrange -- single child monitor
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 33.3}, {})
+ child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {})
+ monitor_set = MonitorSet.new
+ monitor_set.add_or_update(child_monitor_1)
+ monitor.add_member_monitor("child_monitor_1")
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "pass"
+
+
+ #Arrange -- remove none state
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :none, :time, "percentage", {"state_threshold" => 100.0}, {})
+ child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {})
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "none", "time", {}, {}, {})
+
+ monitor_set = MonitorSet.new
+ monitor_set.add_or_update(child_monitor_1)
+ monitor_set.add_or_update(child_monitor_2)
+
+ monitor.add_member_monitor("child_monitor_1")
+ monitor.add_member_monitor("child_monitor_2")
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "pass"
+
+
+ # Arrange
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 50.0}, {})
+
+ child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {})
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {})
+ child_monitor_3 = UnitMonitor.new("monitor_3", "child_monitor_3", "fail", "time", {}, {}, {})
+
+ monitor_set = MonitorSet.new
+ monitor_set.add_or_update(child_monitor_1)
+ monitor_set.add_or_update(child_monitor_2)
+ monitor_set.add_or_update(child_monitor_3)
+
+ monitor.add_member_monitor("child_monitor_1")
+ monitor.add_member_monitor("child_monitor_2")
+ monitor.add_member_monitor("child_monitor_3")
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "fail"
+
+
+ # Arrange
+ monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 90.0}, {})
+
+ child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {})
+ child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "pass", "time", {}, {}, {})
+ child_monitor_3 = UnitMonitor.new("monitor_3", "child_monitor_3", "pass", "time", {}, {}, {})
+
+ monitor_set = MonitorSet.new
+ monitor_set.add_or_update(child_monitor_1)
+ monitor_set.add_or_update(child_monitor_2)
+ monitor_set.add_or_update(child_monitor_3)
+
+ monitor.add_member_monitor("child_monitor_1")
+ monitor.add_member_monitor("child_monitor_2")
+ monitor.add_member_monitor("child_monitor_3")
+ #Act
+ monitor.calculate_state(monitor_set)
+ #Assert
+ assert_equal monitor.state, "pass"
+ end
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/aggregate_monitor_state_finalizer_spec.rb b/test/code/plugin/health/aggregate_monitor_state_finalizer_spec.rb
new file mode 100644
index 000000000..f1ae0564d
--- /dev/null
+++ b/test/code/plugin/health/aggregate_monitor_state_finalizer_spec.rb
@@ -0,0 +1,59 @@
+require_relative '../test_helpers'
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+include Minitest
+
+describe "AggregateMonitorStateFinalizer spec" do
+ it 'computes the right state and details' do
+ #arrange
+ monitor_set = Mock.new
+
+ #mock unit monitors
+ child1 = Mock.new
+ def child1.state; "pass"; end
+ def child1.monitor_id; "child1";end
+ def child1.monitor_instance_id; "child1"; end
+ def child1.nil?; false; end
+ def child1.is_aggregate_monitor; false; end
+
+ child2 = Mock.new
+ def child2.state; "fail"; end
+ def child2.monitor_id; "child2";end
+ def child2.monitor_instance_id; "child2"; end
+ def child2.nil?; false; end
+ def child2.is_aggregate_monitor; false; end
+
+ parent_monitor = AggregateMonitor.new("parent_monitor", "parent_monitor", :none, :time, "worstOf", nil, {})
+ parent_monitor.add_member_monitor("child1")
+ parent_monitor.add_member_monitor("child2")
+
+ top_level_monitor = AggregateMonitor.new("cluster", "cluster", :none, :time, "worstOf", nil, {})
+ top_level_monitor.add_member_monitor("parent_monitor")
+
+ monitor_set.expect(:get_map, {"cluster" => top_level_monitor, "parent_monitor" => parent_monitor, "child1" => child1, "child2" => child2})
+ monitor_set.expect(:get_monitor, top_level_monitor, ["cluster"])
+ monitor_set.expect(:get_monitor, parent_monitor, ["parent_monitor"])
+ monitor_set.expect(:get_monitor, child1, ["child1"])
+ monitor_set.expect(:get_monitor, child2, ["child2"])
+ monitor_set.expect(:get_monitor, child1, ["child1"])
+ monitor_set.expect(:get_monitor, child2, ["child2"])
+ monitor_set.expect(:get_monitor, parent_monitor, ["parent_monitor"])
+
+
+ monitor_set.expect(:get_monitor, parent_monitor, ["parent_monitor"])
+ monitor_set.expect(:get_monitor, child1, ["child1"])
+ monitor_set.expect(:get_monitor, child2, ["child2"])
+
+ #act
+ finalizer = AggregateMonitorStateFinalizer.new
+ finalizer.finalize(monitor_set)
+ #assert
+
+ assert_equal parent_monitor.state, "fail"
+ assert_equal parent_monitor.details, {"details"=>{"pass"=>["child1"], "fail"=>["child2"]}, "state"=>"fail", "timestamp"=>:time}
+
+ assert_equal top_level_monitor.state, "fail"
+ assert_equal top_level_monitor.details, {"details"=>{"fail"=>["parent_monitor"]}, "state"=>"fail", "timestamp"=>:time}
+
+ end
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/ca.crt b/test/code/plugin/health/ca.crt
new file mode 100644
index 000000000..9daeafb98
--- /dev/null
+++ b/test/code/plugin/health/ca.crt
@@ -0,0 +1 @@
+test
diff --git a/test/code/plugin/health/cluster_health_state_spec.rb b/test/code/plugin/health/cluster_health_state_spec.rb
new file mode 100644
index 000000000..897291fe2
--- /dev/null
+++ b/test/code/plugin/health/cluster_health_state_spec.rb
@@ -0,0 +1,37 @@
+require_relative '../test_helpers'
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+require 'time'
+include HealthModel
+include Minitest
+
+describe "Cluster Health State Spec" do
+
+ it "ClusterHealthState.new throws if cert file is NOT present" do
+ state = {
+ "m1" => {
+ "state" => "pass",
+ "time" => Time.now.utc.iso8601
+ }
+ }
+
+ token_file_path = 'token'
+ cert_file_path = '/var/ca.crt'
+
+ proc {ClusterHealthState.new(token_file_path, cert_file_path)}.must_raise
+
+ end
+
+ it "ClusterHealthState.new returns nil if token is NOT present" do
+ state = {
+ "m1" => {
+ "state" => "pass",
+ "time" => Time.now.utc.iso8601
+ }
+ }
+ token_file_path = 'token'
+ cert_file_path = File.join(File.expand_path(File.dirname(__FILE__)), "ca.crt")
+
+ chs = ClusterHealthState.new(token_file_path, cert_file_path)
+ chs.token.must_be_nil
+ end
+end
diff --git a/test/code/plugin/health/health_hierarchy_builder_spec.rb b/test/code/plugin/health/health_hierarchy_builder_spec.rb
new file mode 100644
index 000000000..daafe0312
--- /dev/null
+++ b/test/code/plugin/health/health_hierarchy_builder_spec.rb
@@ -0,0 +1,11 @@
+require_relative '../test_helpers'
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+include Minitest
+
+describe "HealthHierarchyBuilder spec" do
+ it 'builds right hierarchy given a child monitor and a parent monitor provider' do
+
+ end
+
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/health_kubernetes_resource_spec.rb b/test/code/plugin/health/health_kubernetes_resource_spec.rb
new file mode 100644
index 000000000..c27d969ec
--- /dev/null
+++ b/test/code/plugin/health/health_kubernetes_resource_spec.rb
@@ -0,0 +1,222 @@
+require_relative '../test_helpers'
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+
+describe "HealthKubernetesResources spec" do
+ it "returns the right set of nodes and workloads given node and pod inventory" do
+
+ #arrange
+ nodes_json = '{
+ "items": [
+ {
+ "metadata": {
+ "name": "aks-nodepool1-19574989-0"
+ }
+ },
+ {
+ "metadata": {
+ "name": "aks-nodepool1-19574989-1"
+ }
+ }
+ ]
+ }'
+
+ pods_json = '{
+ "items": [
+ {
+ "metadata": {
+ "name": "diliprdeploymentnodeapps-c4fdfb446-mzcsr",
+ "generateName": "diliprdeploymentnodeapps-c4fdfb446-",
+ "namespace": "default",
+ "selfLink": "/api/v1/namespaces/default/pods/diliprdeploymentnodeapps-c4fdfb446-mzcsr",
+ "uid": "ee31a9ce-526e-11e9-a899-6a5520730c61",
+ "resourceVersion": "4597573",
+ "creationTimestamp": "2019-03-29T22:06:40Z",
+ "labels": {
+ "app": "diliprsnodeapppod",
+ "diliprPodLabel1": "p1",
+ "diliprPodLabel2": "p2",
+ "pod-template-hash": "709896002"
+ },
+ "ownerReferences": [
+ {
+ "apiVersion": "apps/v1",
+ "kind": "ReplicaSet",
+ "name": "diliprdeploymentnodeapps-c4fdfb446",
+ "uid": "ee1e78e0-526e-11e9-a899-6a5520730c61",
+ "controller": true,
+ "blockOwnerDeletion": true
+ }
+ ]
+ },
+ "apiVersion": "v1",
+ "kind": "Pod"
+ },
+ {
+ "metadata": {
+ "name": "pi-m8ccw",
+ "generateName": "pi-",
+ "namespace": "default",
+ "selfLink": "/api/v1/namespaces/default/pods/pi-m8ccw",
+ "uid": "9fb16aaa-7ccc-11e9-8d23-32c49ee6f300",
+ "resourceVersion": "7940877",
+ "creationTimestamp": "2019-05-22T20:03:10Z",
+ "labels": {
+ "controller-uid": "9fad836f-7ccc-11e9-8d23-32c49ee6f300",
+ "job-name": "pi"
+ },
+ "ownerReferences": [
+ {
+ "apiVersion": "batch/v1",
+ "kind": "Job",
+ "name": "pi",
+ "uid": "9fad836f-7ccc-11e9-8d23-32c49ee6f300",
+ "controller": true,
+ "blockOwnerDeletion": true
+ }
+ ]
+ },
+ "apiVersion": "v1",
+ "kind": "Pod"
+ },
+ {
+ "metadata": {
+ "name": "rss-site",
+ "namespace": "default",
+ "selfLink": "/api/v1/namespaces/default/pods/rss-site",
+ "uid": "68a34ea4-7ce4-11e9-8d23-32c49ee6f300",
+ "resourceVersion": "7954135",
+ "creationTimestamp": "2019-05-22T22:53:26Z",
+ "labels": {
+ "app": "web"
+ },
+ "annotations": {
+ "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"v1\",\"kind\":\"Pod\",\"metadata\":{\"annotations\":{},\"labels\":{\"app\":\"web\"},\"name\":\"rss-site\",\"namespace\":\"default\"},\"spec\":{\"containers\":[{\"image\":\"nginx\",\"name\":\"front-end\",\"ports\":[{\"containerPort\":80}]},{\"image\":\"nickchase/rss-php-nginx:v1\",\"name\":\"rss-reader\",\"ports\":[{\"containerPort\":88}]}]}}\n"
+ }
+ },
+ "apiVersion": "v1",
+ "kind": "Pod"
+ },
+ {
+ "metadata": {
+ "name": "kube-proxy-4hjws",
+ "generateName": "kube-proxy-",
+ "namespace": "kube-system",
+ "selfLink": "/api/v1/namespaces/kube-system/pods/kube-proxy-4hjws",
+ "uid": "8cf7c410-88f4-11e9-b1b0-5eb4a3e9de7d",
+ "resourceVersion": "9661065",
+ "creationTimestamp": "2019-06-07T07:19:12Z",
+ "labels": {
+ "component": "kube-proxy",
+ "controller-revision-hash": "1271944371",
+ "pod-template-generation": "16",
+ "tier": "node"
+ },
+ "annotations": {
+ "aks.microsoft.com/release-time": "seconds:1559735217 nanos:797729016 ",
+ "remediator.aks.microsoft.com/kube-proxy-restart": "7"
+ },
+ "ownerReferences": [
+ {
+ "apiVersion": "apps/v1",
+ "kind": "DaemonSet",
+ "name": "kube-proxy",
+ "uid": "45640bf6-44e5-11e9-9920-423525a6b683",
+ "controller": true,
+ "blockOwnerDeletion": true
+ }
+ ]
+ },
+ "apiVersion": "v1",
+ "kind": "Pod"
+ }
+ ]
+ }'
+ deployments_json = '{
+ "items": [
+ {
+ "metadata": {
+ "name": "diliprdeploymentnodeapps",
+ "namespace": "default",
+ "selfLink": "/apis/extensions/v1beta1/namespaces/default/deployments/diliprdeploymentnodeapps",
+ "uid": "ee1b111d-526e-11e9-a899-6a5520730c61",
+ "resourceVersion": "4597575",
+ "generation": 1,
+ "creationTimestamp": "2019-03-29T22:06:40Z",
+ "labels": {
+ "diliprdeploymentLabel1": "d1",
+ "diliprdeploymentLabel2": "d2"
+ },
+ "annotations": {
+ "deployment.kubernetes.io/revision": "1",
+ "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"apps/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"labels\":{\"diliprdeploymentLabel1\":\"d1\",\"diliprdeploymentLabel2\":\"d2\"},\"name\":\"diliprdeploymentnodeapps\",\"namespace\":\"default\"},\"spec\":{\"replicas\":1,\"selector\":{\"matchLabels\":{\"app\":\"diliprsnodeapppod\"}},\"template\":{\"metadata\":{\"labels\":{\"app\":\"diliprsnodeapppod\",\"diliprPodLabel1\":\"p1\",\"diliprPodLabel2\":\"p2\"}},\"spec\":{\"containers\":[{\"image\":\"rdilip83/logeverysecond:v2\",\"name\":\"diliprcontainerhelloapp\"}]}}}}\n"
+ }
+ },
+ "spec": {
+ "replicas": 1,
+ "selector": {
+ "matchLabels": {
+ "app": "diliprsnodeapppod"
+ }
+ },
+ "template": {
+ "metadata": {
+ "creationTimestamp": null,
+ "labels": {
+ "app": "diliprsnodeapppod",
+ "diliprPodLabel1": "p1",
+ "diliprPodLabel2": "p2"
+ }
+ },
+ "spec": {
+ "containers": [
+ {
+ "name": "diliprcontainerhelloapp",
+ "image": "rdilip83/logeverysecond:v2",
+ "resources": {},
+ "terminationMessagePath": "/dev/termination-log",
+ "terminationMessagePolicy": "File",
+ "imagePullPolicy": "IfNotPresent"
+ }
+ ],
+ "restartPolicy": "Always",
+ "terminationGracePeriodSeconds": 30,
+ "dnsPolicy": "ClusterFirst",
+ "securityContext": {},
+ "schedulerName": "default-scheduler"
+ }
+ },
+ "strategy": {
+ "type": "RollingUpdate",
+ "rollingUpdate": {
+ "maxUnavailable": "25%",
+ "maxSurge": "25%"
+ }
+ },
+ "revisionHistoryLimit": 2,
+ "progressDeadlineSeconds": 600
+ },
+ "apiVersion": "extensions/v1beta1",
+ "kind": "Deployment"
+ }
+ ]
+ }'
+ nodes = JSON.parse(nodes_json)
+ pods = JSON.parse(pods_json)
+ deployments = JSON.parse(deployments_json)
+ resources = HealthKubernetesResources.instance
+ resources.node_inventory = nodes
+ resources.pod_inventory = pods
+ resources.deployment_inventory = deployments
+ #act
+ parsed_nodes = resources.get_nodes
+ parsed_workloads = resources.get_workload_names
+
+ #assert
+ assert_equal parsed_nodes.size, 2
+ assert_equal parsed_workloads.size, 3
+
+ assert_equal parsed_nodes, ['aks-nodepool1-19574989-0', 'aks-nodepool1-19574989-1']
+ assert_equal parsed_workloads, ['default~~diliprdeploymentnodeapps', 'default~~rss-site', 'kube-system~~kube-proxy']
+ end
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/health_missing_signal_generator_spec.rb b/test/code/plugin/health/health_missing_signal_generator_spec.rb
new file mode 100644
index 000000000..98d65416d
--- /dev/null
+++ b/test/code/plugin/health/health_missing_signal_generator_spec.rb
@@ -0,0 +1,79 @@
+require_relative '../test_helpers'
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each {|file| require file}
+include HealthModel
+include Minitest
+
+describe "HealthMissingSignalGenerator spec" do
+ it 'generates missing node signals' do
+ #arrange
+ resources = Mock.new
+ resources.expect(:get_nodes, ["node1"])
+ resources.expect(:get_workload_names, ["default~~workload1"])
+
+ provider = Mock.new
+ provider.expect(:get_node_labels, {HealthMonitorLabels::HOSTNAME => "node1"}, ["node1"])
+
+ node1_cpu_record = Mock.new
+ def node1_cpu_record.monitor_id; "node_cpu_utilization"; end
+ def node1_cpu_record.monitor_instance_id; "node_cpu_utilization"; end
+ def node1_cpu_record.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end
+ def node1_cpu_record.config; {}; end
+ def node1_cpu_record.state; "pass"; end
+
+ node1_memory_record = Mock.new
+ def node1_memory_record.monitor_id; "node_memory_utilization"; end
+ def node1_memory_record.monitor_instance_id; "node_memory_utilization"; end
+ def node1_memory_record.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end
+ def node1_memory_record.config; {}; end
+ def node1_memory_record.state; "pass"; end
+
+ node1_condition_record = Mock.new
+ def node1_condition_record.monitor_id; "node_condition"; end
+ def node1_condition_record.monitor_instance_id; "node_condition-0c593682737a955dc8e0947ad12754fe"; end
+ def node1_condition_record.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end
+ def node1_condition_record.config; {}; end
+ def node1_condition_record.state; "pass"; end
+
+
+ workload1_pods_ready_record = Mock.new
+ def workload1_pods_ready_record.monitor_id; "user_workload_pods_ready"; end
+ def workload1_pods_ready_record.monitor_instance_id; "user_workload_pods_ready-workload1"; end
+ def workload1_pods_ready_record.labels; {HealthMonitorLabels::NAMESPACE => "default", HealthMonitorLabels::WORKLOAD_NAME => "workload1"}; end
+ def workload1_pods_ready_record.config; {}; end
+ def workload1_pods_ready_record.state; "pass"; end
+
+ generator = HealthMissingSignalGenerator.new
+ generator.update_last_received_records([node1_cpu_record, node1_memory_record, node1_condition_record, workload1_pods_ready_record])
+
+ #act
+ missing = generator.get_missing_signals('fake_cluster_id', [node1_cpu_record, node1_memory_record], resources, provider)
+
+ #assert
+ assert_equal missing.size, 2
+
+ assert_equal missing[0].monitor_id, "node_condition"
+ assert_equal missing[0].state, "unknown"
+ assert_equal missing[0].monitor_instance_id, "node_condition-0c593682737a955dc8e0947ad12754fe"
+
+ assert_equal missing[1].monitor_id, "user_workload_pods_ready"
+ assert_equal missing[1].state, "unknown"
+ assert_equal missing[1].monitor_instance_id, "user_workload_pods_ready-workload1"
+
+ #arrange
+ resources.expect(:get_nodes, ["node1"])
+ resources.expect(:get_workload_names, ["default~~workload1"])
+ provider.expect(:get_node_labels, {HealthMonitorLabels::HOSTNAME => "node1"}, ["node1"])
+ generator.update_last_received_records([node1_cpu_record, node1_memory_record])
+ #act
+ missing = generator.get_missing_signals('fake_cluster_id', [node1_cpu_record, node1_memory_record], resources, provider)
+ #assert
+ assert_equal missing.size, 2
+ assert_equal missing[0].monitor_id, "node_condition"
+ assert_equal missing[0].state, "unknown"
+ assert_equal missing[0].monitor_instance_id, "node_condition-0c593682737a955dc8e0947ad12754fe"
+
+ assert_equal missing[1].monitor_id, "user_workload_pods_ready"
+ assert_equal missing[1].state, "none"
+ assert_equal missing[1].monitor_instance_id, "user_workload_pods_ready-workload1"
+ end
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/health_model_buffer_spec.rb b/test/code/plugin/health/health_model_buffer_spec.rb
new file mode 100644
index 000000000..259513c08
--- /dev/null
+++ b/test/code/plugin/health/health_model_buffer_spec.rb
@@ -0,0 +1,25 @@
+require_relative '../../../../source/code/plugin/health/health_model_buffer'
+require_relative '../test_helpers'
+
+include HealthModel
+
+describe "HealthModelBuffer Spec" do
+ it "get_buffer returns the correct buffer data" do
+ # Arrange
+ buffer = HealthModelBuffer.new
+ # Act
+ buffer.add_to_buffer(['mockRecord'])
+ # Assert
+ assert_equal buffer.get_buffer.length, 1
+
+ #Act
+ buffer.add_to_buffer(['mockRecord1', 'mockRecord2'])
+ #Assert
+ assert_equal buffer.get_buffer.length, 3
+
+ #Act
+ buffer.reset_buffer
+ #Assert
+ assert_equal buffer.get_buffer.length, 0
+ end
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/health_model_builder_spec.rb b/test/code/plugin/health/health_model_builder_spec.rb
new file mode 100644
index 000000000..c49e6c92a
--- /dev/null
+++ b/test/code/plugin/health/health_model_builder_spec.rb
@@ -0,0 +1,37 @@
+require_relative '../test_helpers'
+# consider doing this in test_helpers.rb so that this code is common
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+include Minitest
+
+describe "HealthModelBuilder spec" do
+ it "Verify hierarchy builder and finalizer public methods are called" do
+ #arrange
+ mock_hierarchy_builder = Mock::new
+ health_record = Mock::new
+ mock_monitor_set = Mock::new
+ mock_state_finalizer = Mock::new
+ mock_hierarchy_builder.expect(:process_record, nil, [health_record, mock_monitor_set])
+ mock_state_finalizer.expect(:finalize, {}, [mock_monitor_set])
+ def mock_monitor_set.get_map; {}; end
+
+ #act
+ builder = HealthModelBuilder.new(mock_hierarchy_builder, [mock_state_finalizer], mock_monitor_set)
+ builder.process_records([health_record])
+ builder.finalize_model
+ #assert
+ assert mock_hierarchy_builder.verify
+ assert mock_state_finalizer.verify
+ end
+
+ it "Verify finalize_model raises if state_finalizers is empty" do
+ #arrange
+ mock_hierarchy_builder = Mock.new
+ mock_monitor_set = Mock.new
+ builder = HealthModelBuilder.new(mock_hierarchy_builder, [], mock_monitor_set)
+ #act and assert
+ assert_raises do
+ builder.finalize_model
+ end
+ end
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/health_model_builder_test.rb b/test/code/plugin/health/health_model_builder_test.rb
new file mode 100644
index 000000000..df921049c
--- /dev/null
+++ b/test/code/plugin/health/health_model_builder_test.rb
@@ -0,0 +1,337 @@
+require 'test/unit'
+require 'json'
+# require_relative '../../../source/code/plugin/health'
+
+Dir[File.join(__dir__, '../../../../source/code/plugin/health', '*.rb')].each { |file| require file }
+
+class FilterHealthModelBuilderTest < Test::Unit::TestCase
+ include HealthModel
+
+ def test_event_stream
+ #setup
+ health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json')
+ health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file)
+ monitor_factory = MonitorFactory.new
+ hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory)
+ # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side
+ state_finalizers = [AggregateMonitorStateFinalizer.new]
+ monitor_set = MonitorSet.new
+ model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set)
+
+ nodes_file_map = {
+ #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json",
+ "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ }
+
+ pods_file_map = {
+ #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json",
+ "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ }
+
+ cluster_labels = {
+ 'container.azm.ms/cluster-region' => 'eastus',
+ 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a',
+ 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test',
+ 'container.azm.ms/cluster-name' => 'dilipr-health-test'
+ }
+
+ cluster_id = 'fake_cluster_id'
+
+ #test
+ state = HealthMonitorState.new()
+ generator = HealthMissingSignalGenerator.new
+
+ for scenario in ["first", "second", "third"]
+ mock_data_path = File.join(__dir__, "../../../../health_records/#{scenario}_daemon_set_signals.json")
+ file = File.read(mock_data_path)
+ records = JSON.parse(file)
+
+ node_inventory = JSON.parse(File.read(nodes_file_map[scenario]))
+ pod_inventory = JSON.parse(File.read(pods_file_map[scenario]))
+ deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/deployments.json")))
+ resources = HealthKubernetesResources.instance
+ resources.node_inventory = node_inventory
+ resources.pod_inventory = pod_inventory
+ resources.deployment_inventory = deployment_inventory
+
+ workload_names = resources.get_workload_names
+ provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json"))
+
+ health_monitor_records = []
+ records.each do |record|
+ monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID]
+ monitor_id = record[HealthMonitorRecordFields::MONITOR_ID]
+ health_monitor_record = HealthMonitorRecord.new(
+ record[HealthMonitorRecordFields::MONITOR_ID],
+ record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID],
+ record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED],
+ record[HealthMonitorRecordFields::DETAILS]["state"],
+ provider.get_labels(record),
+ provider.get_config(monitor_id),
+ record[HealthMonitorRecordFields::DETAILS]
+ )
+
+ state.update_state(health_monitor_record,
+ provider.get_config(health_monitor_record.monitor_id)
+ )
+
+ # get the health state based on the monitor's operational state
+ # update state calls updates the state of the monitor based on configuration and history of the the monitor records
+ health_monitor_record.state = state.get_state(monitor_instance_id).new_state
+ health_monitor_records.push(health_monitor_record)
+ instance_state = state.get_state(monitor_instance_id)
+ #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}"
+ end
+
+
+ #handle kube api down
+ kube_api_down_handler = HealthKubeApiDownHandler.new
+ health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records)
+
+ # Dedupe daemonset signals
+ # Remove unit monitor signals for “gone” objects
+ reducer = HealthSignalReducer.new()
+ reduced_records = reducer.reduce_signals(health_monitor_records, resources)
+
+ cluster_id = 'fake_cluster_id'
+
+ #get the list of 'none' and 'unknown' signals
+ missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider)
+ #update state for missing signals
+ missing_signals.each{|signal|
+ state.update_state(signal,
+ provider.get_config(signal.monitor_id)
+ )
+ }
+ generator.update_last_received_records(reduced_records)
+ reduced_records.push(*missing_signals)
+
+ # build the health model
+ all_records = reduced_records
+ model_builder.process_records(all_records)
+ all_monitors = model_builder.finalize_model
+
+ # update the state for aggregate monitors (unit monitors are updated above)
+ all_monitors.each{|monitor_instance_id, monitor|
+ if monitor.is_aggregate_monitor
+ state.update_state(monitor,
+ provider.get_config(monitor.monitor_id)
+ )
+ end
+
+ instance_state = state.get_state(monitor_instance_id)
+ #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}"
+ should_send = instance_state.should_send
+
+ # always send cluster monitor as a heartbeat
+ if !should_send && monitor_instance_id != MonitorId::CLUSTER
+ all_monitors.delete(monitor_instance_id)
+ end
+ }
+
+ records_to_send = []
+ all_monitors.keys.each{|key|
+ record = provider.get_record(all_monitors[key], state)
+ #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}"
+ }
+
+ if scenario == "first"
+ assert_equal 50, all_monitors.size
+ elsif scenario == "second"
+ assert_equal 34, all_monitors.size
+ elsif scenario == "third"
+ assert_equal 5, all_monitors.size
+ end
+ # for each key in monitor.keys,
+ # get the state from health_monitor_state
+ # generate the record to send
+ serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records/health_model_state.json'))
+ serializer.serialize(state)
+
+ deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records/health_model_state.json'))
+ deserialized_state = deserializer.deserialize
+
+ after_state = HealthMonitorState.new
+ after_state.initialize_state(deserialized_state)
+ end
+ end
+
+ def test_event_stream_aks_engine
+
+ #setup
+ health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json')
+ health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file)
+ monitor_factory = MonitorFactory.new
+ hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory)
+ state_finalizers = [AggregateMonitorStateFinalizer.new]
+ monitor_set = MonitorSet.new
+ model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set)
+
+ nodes_file_map = {
+ #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json",
+ #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json",
+ "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json",
+ "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json",
+ "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json",
+ }
+
+ pods_file_map = {
+ #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json",
+ #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json",
+ "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json",
+ "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json",
+ "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json",
+ }
+
+ cluster_labels = {
+ 'container.azm.ms/cluster-region' => 'eastus',
+ 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a',
+ 'container.azm.ms/cluster-resource-group' => 'aks-engine-health',
+ 'container.azm.ms/cluster-name' => 'aks-engine-health'
+ }
+
+ cluster_id = 'fake_cluster_id'
+
+ #test
+ state = HealthMonitorState.new()
+ generator = HealthMissingSignalGenerator.new
+
+ for scenario in 1..3
+ mock_data_path = File.join(__dir__, "../../../../health_records/aks-engine/aks-engine-#{scenario}.json")
+ file = File.read(mock_data_path)
+ records = JSON.parse(file)
+
+ node_inventory = JSON.parse(File.read(nodes_file_map["aks-engine-#{scenario}"]))
+ pod_inventory = JSON.parse(File.read(pods_file_map["aks-engine-#{scenario}"]))
+ deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/aks-engine/deployments.json")))
+ resources = HealthKubernetesResources.instance
+ resources.node_inventory = node_inventory
+ resources.pod_inventory = pod_inventory
+ resources.deployment_inventory = deployment_inventory
+
+ workload_names = resources.get_workload_names
+ provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json"))
+
+ health_monitor_records = []
+ records.each do |record|
+ monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID]
+ monitor_id = record[HealthMonitorRecordFields::MONITOR_ID]
+ health_monitor_record = HealthMonitorRecord.new(
+ record[HealthMonitorRecordFields::MONITOR_ID],
+ record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID],
+ record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED],
+ record[HealthMonitorRecordFields::DETAILS]["state"],
+ provider.get_labels(record),
+ provider.get_config(monitor_id),
+ record[HealthMonitorRecordFields::DETAILS]
+ )
+
+ state.update_state(health_monitor_record,
+ provider.get_config(health_monitor_record.monitor_id)
+ )
+
+ # get the health state based on the monitor's operational state
+ # update state calls updates the state of the monitor based on configuration and history of the the monitor records
+ health_monitor_record.state = state.get_state(monitor_instance_id).new_state
+ health_monitor_records.push(health_monitor_record)
+ instance_state = state.get_state(monitor_instance_id)
+ #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}"
+ end
+
+
+ #handle kube api down
+ kube_api_down_handler = HealthKubeApiDownHandler.new
+ health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records)
+
+ # Dedupe daemonset signals
+ # Remove unit monitor signals for “gone” objects
+ reducer = HealthSignalReducer.new()
+ reduced_records = reducer.reduce_signals(health_monitor_records, resources)
+
+ cluster_id = 'fake_cluster_id'
+
+ #get the list of 'none' and 'unknown' signals
+ missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider)
+ #update state for missing signals
+ missing_signals.each{|signal|
+ state.update_state(signal,
+ provider.get_config(signal.monitor_id)
+ )
+ }
+ generator.update_last_received_records(reduced_records)
+ reduced_records.push(*missing_signals)
+
+ # build the health model
+ all_records = reduced_records
+ model_builder.process_records(all_records)
+ all_monitors = model_builder.finalize_model
+
+ # update the state for aggregate monitors (unit monitors are updated above)
+ all_monitors.each{|monitor_instance_id, monitor|
+ if monitor.is_aggregate_monitor
+ state.update_state(monitor,
+ provider.get_config(monitor.monitor_id)
+ )
+ end
+
+ instance_state = state.get_state(monitor_instance_id)
+ #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}"
+ should_send = instance_state.should_send
+
+ # always send cluster monitor as a heartbeat
+ if !should_send && monitor_instance_id != MonitorId::CLUSTER
+ all_monitors.delete(monitor_instance_id)
+ end
+ }
+
+ records_to_send = []
+ all_monitors.keys.each{|key|
+ record = provider.get_record(all_monitors[key], state)
+ #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}"
+ }
+
+ if scenario == 1
+ assert_equal 58, all_monitors.size
+ elsif scenario == 2
+ assert_equal 37, all_monitors.size
+ elsif scenario == 3
+ assert_equal 6, all_monitors.size
+ end
+ # for each key in monitor.keys,
+ # get the state from health_monitor_state
+ # generate the record to send
+ serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records/health_model_state_aks-engine.json'))
+ serializer.serialize(state)
+
+ deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records/health_model_state_aks-engine.json'))
+ deserialized_state = deserializer.deserialize
+
+ after_state = HealthMonitorState.new
+ after_state.initialize_state(deserialized_state)
+ end
+ end
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/health_model_definition_parser_spec.rb b/test/code/plugin/health/health_model_definition_parser_spec.rb
new file mode 100644
index 000000000..56551510b
--- /dev/null
+++ b/test/code/plugin/health/health_model_definition_parser_spec.rb
@@ -0,0 +1,24 @@
+require_relative '../test_helpers'
+# consider doing this in test_helpers.rb so that this code is common
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+
+describe "HealthModelDefinitionParser spec " do
+ it "parses the definition file correctly with the right conditions" do
+ #arrange
+
+ parser = HealthModelDefinitionParser.new(File.join(File.expand_path(File.dirname(__FILE__)), 'test_health_model_definition.json'))
+ #act
+ model_definition = parser.parse_file
+
+ #assert
+ assert_equal model_definition['conditional_monitor_id'].key?("conditions"), true
+ assert_equal model_definition['conditional_monitor_id']["conditions"].size, 2
+ assert_equal model_definition['conditional_monitor_id'].key?("parent_monitor_id"), false
+
+ #assert
+ assert_equal model_definition['monitor_id'].key?("conditions"), false
+ assert_equal model_definition['monitor_id'].key?("parent_monitor_id"), true
+ end
+
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/health_monitor_state_spec.rb b/test/code/plugin/health/health_monitor_state_spec.rb
new file mode 100644
index 000000000..5fa8a6c6e
--- /dev/null
+++ b/test/code/plugin/health/health_monitor_state_spec.rb
@@ -0,0 +1,176 @@
+require_relative '../test_helpers'
+# consider doing this in test_helpers.rb so that this code is common
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+include Minitest
+
+describe "HealthMonitorState spec" do
+ it 'updates should_send to true for monitors which have not been sent before' do
+ #arrange
+ state = HealthMonitorState.new
+ mock_monitor = Mock.new
+ def mock_monitor.state; "pass"; end
+ def mock_monitor.monitor_id; "monitor_id"; end
+ def mock_monitor.monitor_instance_id; "monitor_instance_id"; end
+ def mock_monitor.transition_date_time; Time.now.utc.iso8601; end
+ def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end
+
+ #act
+ state.update_state(mock_monitor, {})
+ monitor_state = state.get_state("monitor_instance_id")
+ #assert
+ monitor_state.should_send.must_equal true
+ monitor_state.old_state.must_equal "none"
+ monitor_state.new_state.must_equal "pass"
+ end
+
+ it 'updates should_send to true for monitors which need no consistent state change' do
+ #arrange
+ state = HealthMonitorState.new
+ mock_monitor = Mock.new
+ def mock_monitor.state; "pass"; end
+ def mock_monitor.monitor_id; "monitor_id"; end
+ def mock_monitor.monitor_instance_id; "monitor_instance_id"; end
+ def mock_monitor.transition_date_time; Time.now.utc.iso8601; end
+ def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end
+
+ #act
+ state.update_state(mock_monitor, {})
+ monitor_state = state.get_state("monitor_instance_id")
+ #assert
+ monitor_state.should_send.must_equal true
+ monitor_state.old_state.must_equal "none"
+ monitor_state.new_state.must_equal "pass"
+
+ #arrange
+ def mock_monitor.state; "fail"; end
+ def mock_monitor.details; {"state" => "fail", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end
+ #act
+ state.update_state(mock_monitor, {})
+ monitor_state = state.get_state("monitor_instance_id")
+ #assert
+ monitor_state.should_send.must_equal true
+ monitor_state.old_state.must_equal "pass"
+ monitor_state.new_state.must_equal "fail"
+ end
+
+ it 'updates should_send to false for monitors which need consistent state change and has no consistent state change' do
+ #arrange
+ state = HealthMonitorState.new
+ mock_monitor = Mock.new
+ def mock_monitor.state; "pass"; end
+ def mock_monitor.monitor_id; "monitor_id"; end
+ def mock_monitor.monitor_instance_id; "monitor_instance_id"; end
+ def mock_monitor.transition_date_time; Time.now.utc.iso8601; end
+ def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end
+
+ config = JSON.parse('{
+ "WarnThresholdPercentage": 80.0,
+ "FailThresholdPercentage": 90.0,
+ "ConsecutiveSamplesForStateTransition": 3
+ }')
+ #act
+ state.update_state(mock_monitor, config)
+ monitor_state = state.get_state("monitor_instance_id")
+ #assert
+ monitor_state.should_send.must_equal true
+
+ #arrange
+ def mock_monitor.state; "fail"; end
+ def mock_monitor.details; {"state" => "fail", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end
+ #act
+ state.update_state(mock_monitor, config)
+ monitor_state = state.get_state("monitor_instance_id")
+ #assert
+ monitor_state.should_send.must_equal false
+ end
+
+ it 'updates should_send to true for monitors which need consistent state change and has a consistent state change' do
+ #arrange
+ state = HealthMonitorState.new
+ mock_monitor = Mock.new
+ def mock_monitor.state; "pass"; end
+ def mock_monitor.monitor_id; "monitor_id"; end
+ def mock_monitor.monitor_instance_id; "monitor_instance_id"; end
+ def mock_monitor.transition_date_time; Time.now.utc.iso8601; end
+ def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end
+
+ config = JSON.parse('{
+ "WarnThresholdPercentage": 80.0,
+ "FailThresholdPercentage": 90.0,
+ "ConsecutiveSamplesForStateTransition": 3
+ }')
+ #act
+ state.update_state(mock_monitor, config)
+ monitor_state = state.get_state("monitor_instance_id")
+ #assert
+ monitor_state.should_send.must_equal true
+
+ #arrange
+ def mock_monitor.state; "fail"; end
+ def mock_monitor.details; {"state" => "fail", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end
+ #act
+ state.update_state(mock_monitor, config)
+ monitor_state = state.get_state("monitor_instance_id")
+ #assert
+ monitor_state.should_send.must_equal false
+
+ #act
+ state.update_state(mock_monitor, config)
+ state.update_state(mock_monitor, config)
+ monitor_state = state.get_state("monitor_instance_id")
+ #assert
+ monitor_state.should_send.must_equal true
+ monitor_state.old_state.must_equal "none"
+ monitor_state.new_state.must_equal "fail"
+ end
+
+ it 'updates should_send to false for monitors which need consistent state change and has NO state change' do
+ #arrange
+ state = HealthMonitorState.new
+ mock_monitor = Mock.new
+ def mock_monitor.state; "pass"; end
+ def mock_monitor.monitor_id; "monitor_id"; end
+ def mock_monitor.monitor_instance_id; "monitor_instance_id"; end
+ def mock_monitor.transition_date_time; Time.now.utc.iso8601; end
+ def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end
+
+ config = JSON.parse('{
+ "WarnThresholdPercentage": 80.0,
+ "FailThresholdPercentage": 90.0,
+ "ConsecutiveSamplesForStateTransition": 3
+ }')
+ #act
+ state.update_state(mock_monitor, config)
+ monitor_state = state.get_state("monitor_instance_id")
+ #assert
+ monitor_state.should_send.must_equal true
+ monitor_state.old_state.must_equal "none"
+ monitor_state.new_state.must_equal "none"
+
+
+ #arrange
+ def mock_monitor.state; "pass"; end
+ def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end
+ #act
+ state.update_state(mock_monitor, config)
+ monitor_state = state.get_state("monitor_instance_id")
+ #assert
+ monitor_state.should_send.must_equal false
+
+ #act
+ state.update_state(mock_monitor, config)
+ monitor_state.should_send.must_equal true
+ monitor_state.old_state.must_equal "none"
+ monitor_state.new_state.must_equal "pass"
+
+ #act
+ state.update_state(mock_monitor, config)
+ monitor_state = state.get_state("monitor_instance_id")
+ #assert
+ monitor_state.should_send.must_equal false
+ monitor_state.old_state.must_equal "none"
+ monitor_state.new_state.must_equal "pass"
+ end
+
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/health_signal_reducer_spec.rb b/test/code/plugin/health/health_signal_reducer_spec.rb
new file mode 100644
index 000000000..f71a5c509
--- /dev/null
+++ b/test/code/plugin/health/health_signal_reducer_spec.rb
@@ -0,0 +1,96 @@
+require_relative '../test_helpers'
+# consider doing this in test_helpers.rb so that this code is common
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+include Minitest
+
+describe "HealthSignalReducer spec" do
+ it "returns the right set of records -- no reduction" do
+ #arrange
+ record1 = Mock.new
+ def record1.monitor_id; "node_cpu_utilization"; end
+ def record1.monitor_instance_id; "node_cpu_utilization-node1"; end
+ def record1.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end
+ inventory = Mock.new
+ def inventory.get_nodes; ["node1"]; end
+ def inventory.get_workload_names; []; end
+ reducer = HealthSignalReducer.new
+ #act
+ reduced = reducer.reduce_signals([record1], inventory)
+ #Assert
+ assert_equal reduced.size, 1
+ end
+
+ it "returns only the latest record if multiple records are present for the same monitor" do
+ #arrange
+ record1 = Mock.new
+ def record1.monitor_id; "node_cpu_utilization"; end
+ def record1.monitor_instance_id; "node_cpu_utilization-node1"; end
+ def record1.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end
+ def record1.transition_date_time; Time.now.utc.iso8601 ; end
+
+
+ record2 = Mock.new
+ def record2.monitor_id; "node_cpu_utilization"; end
+ def record2.monitor_instance_id; "node_cpu_utilization-node1"; end
+ def record2.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end
+ def record2.transition_date_time; "#{Time.now.utc.iso8601}" ; end
+
+ inventory = Mock.new
+ def inventory.get_nodes; ["node1"]; end
+ def inventory.get_workload_names; []; end
+ reducer = HealthSignalReducer.new
+ #act
+ reduced = reducer.reduce_signals([record1, record2], inventory)
+ #Assert
+ assert_equal reduced.size, 1
+ end
+
+ it "returns only those records if the node is present in the inventory" do
+ #arrange
+ record1 = Mock.new
+ def record1.monitor_id; "node_cpu_utilization"; end
+ def record1.monitor_instance_id; "node_cpu_utilization-node1"; end
+ def record1.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end
+ inventory = Mock.new
+ def inventory.get_nodes; ["node2"]; end
+ def inventory.get_workload_names; []; end
+
+ #act
+ reducer = HealthSignalReducer.new
+ #assert
+ assert_equal reducer.reduce_signals([record1], inventory).size, 0
+ end
+
+ it "returns only those records if the workload name is present in the inventory" do
+ #arrange
+ record1 = Mock.new
+ def record1.monitor_id; "user_workload_pods_ready"; end
+ def record1.monitor_instance_id; "user_workload_pods_ready-workload1"; end
+ def record1.labels; {HealthMonitorLabels::NAMESPACE => "default", HealthMonitorLabels::WORKLOAD_NAME => "workload1"}; end
+ def record1.transition_date_time; Time.now.utc.iso8601 ; end
+
+ inventory = Mock.new
+ def inventory.get_nodes; ["node2"]; end
+ def inventory.get_workload_names; ["default~~workload1"]; end
+ reducer = HealthSignalReducer.new
+
+ #act
+ reduced = reducer.reduce_signals([record1], inventory)
+
+ #assert
+ assert_equal reduced.size, 1
+
+ #arrange
+ record2 = Mock.new
+ def record2.monitor_id; "user_workload_pods_ready"; end
+ def record2.monitor_instance_id; "user_workload_pods_ready-workload2"; end
+ def record2.labels; {HealthMonitorLabels::NAMESPACE => "default1", HealthMonitorLabels::WORKLOAD_NAME => "workload2"}; end
+ def record2.transition_date_time; Time.now.utc.iso8601 ; end
+ #act
+ reduced = reducer.reduce_signals([record1, record2], inventory)
+ #assert
+ assert_equal reduced.size, 1
+ end
+
+end
diff --git a/test/code/plugin/health/kube_api_down_handler_spec.rb b/test/code/plugin/health/kube_api_down_handler_spec.rb
new file mode 100644
index 000000000..3f3f9b37f
--- /dev/null
+++ b/test/code/plugin/health/kube_api_down_handler_spec.rb
@@ -0,0 +1,26 @@
+require_relative '../test_helpers'
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+
+describe "KubeApiDownHandler spec" do
+ it "updates states for monitors in monitors_to_change" do
+ #arrange
+ record1 = HealthMonitorRecord.new("node_condition", "node_condition-node1", Time.now.utc.iso8601, "pass", {}, {}, {})
+ record2 = HealthMonitorRecord.new("kube_api_status", "kube_api_status", Time.now.utc.iso8601, "fail", {}, {}, {})
+ record3 = HealthMonitorRecord.new("user_workload_pods_ready", "user_workload_pods_ready-workload1", Time.now.utc.iso8601, "pass", {}, {}, {})
+ record4 = HealthMonitorRecord.new("system_workload_pods_ready", "system_workload_pods_ready-workload2", Time.now.utc.iso8601, "pass", {}, {}, {})
+ record5 = HealthMonitorRecord.new("subscribed_capacity_cpu", "subscribed_capacity_cpu", Time.now.utc.iso8601, "pass", {}, {}, {})
+ record6 = HealthMonitorRecord.new("subscribed_capacity_memory", "subscribed_capacity_memory", Time.now.utc.iso8601, "pass", {}, {}, {})
+ handler = HealthKubeApiDownHandler.new
+
+ #act
+ handler.handle_kube_api_down([record1, record2, record3, record4, record5, record6])
+ #assert
+ assert_equal record1.state, HealthMonitorStates::UNKNOWN
+ assert_equal record3.state, HealthMonitorStates::UNKNOWN
+ assert_equal record4.state, HealthMonitorStates::UNKNOWN
+ assert_equal record5.state, HealthMonitorStates::UNKNOWN
+ assert_equal record6.state, HealthMonitorStates::UNKNOWN
+
+ end
+end
diff --git a/test/code/plugin/health/monitor_factory_spec.rb b/test/code/plugin/health/monitor_factory_spec.rb
new file mode 100644
index 000000000..2135808bd
--- /dev/null
+++ b/test/code/plugin/health/monitor_factory_spec.rb
@@ -0,0 +1,28 @@
+require_relative '../test_helpers'
+# consider doing this in test_helpers.rb so that this code is common
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+
+describe "MonitorFactory Spec" do
+ it "returns UnitMonitor for create_unit_monitor" do
+ #Arrange
+ factory = MonitorFactory.new()
+ monitor_record = HealthMonitorRecord.new(:monitor_id, :monitor_instance_id, :time, :pass, {}, {}, {})
+ #act
+ monitor = factory.create_unit_monitor(monitor_record)
+ # assert
+ monitor.must_be_kind_of(UnitMonitor)
+ end
+
+ it "returns AggregateMonitor for create_aggregate_monitor" do
+ #arrange
+ factory = MonitorFactory.new()
+ mock = Minitest::Mock.new
+ def mock.state; :pass; end
+ def mock.transition_date_time; :time; end
+ #act
+ monitor = factory.create_aggregate_monitor(:monitor_id, :monitor_instance_id, :pass, {}, {}, mock)
+ #assert
+ monitor.must_be_kind_of(AggregateMonitor)
+ end
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/monitor_set_spec.rb b/test/code/plugin/health/monitor_set_spec.rb
new file mode 100644
index 000000000..1f4e970be
--- /dev/null
+++ b/test/code/plugin/health/monitor_set_spec.rb
@@ -0,0 +1,58 @@
+require_relative '../test_helpers'
+# consider doing this in test_helpers.rb so that this code is common
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+
+describe "MonitorSet Spec" do
+ it "add_or_update -- adds a monitor" do
+ #arrange
+ set = MonitorSet.new
+ mock_monitor = Minitest::Mock.new
+ def mock_monitor.monitor_instance_id; "monitor_instance_id_1"; end
+ def mock_monitor.state; :pass;end
+ #act
+ set.add_or_update(mock_monitor)
+ #assert
+ assert_equal set.get_map.size, 1
+ assert_equal set.get_map.key?("monitor_instance_id_1"), true
+ end
+
+ it "add_or_update -- updates a monitor" do
+ #arrange
+ set = MonitorSet.new
+ mock_monitor = Minitest::Mock.new
+ def mock_monitor.monitor_instance_id; "monitor_instance_id_1"; end
+ def mock_monitor.state; :pass;end
+ #act
+ set.add_or_update(mock_monitor)
+ #assert
+ assert_equal set.get_map["monitor_instance_id_1"].state, :pass
+
+ #act
+ def mock_monitor.state; :fail;end
+ set.add_or_update(mock_monitor)
+ #assert
+ assert_equal set.get_map["monitor_instance_id_1"].state, :fail
+ end
+
+ it "delete -- delete a monitor" do
+ #arrange
+ set = MonitorSet.new
+ mock_monitor = Minitest::Mock.new
+ def mock_monitor.monitor_instance_id; "monitor_instance_id_1"; end
+ def mock_monitor.state; :pass;end
+ set.add_or_update(mock_monitor)
+
+ #act
+ set.delete("monitor_instance_id_1")
+ #assert
+ assert_equal set.get_map.size, 0
+ end
+
+ it "get_map -- returns a hash" do
+ #arrange
+ set = MonitorSet.new
+ #act and assert
+ set.get_map.must_be_kind_of(Hash)
+ end
+end
diff --git a/test/code/plugin/health/parent_monitor_provider_spec.rb b/test/code/plugin/health/parent_monitor_provider_spec.rb
new file mode 100644
index 000000000..a83db50fc
--- /dev/null
+++ b/test/code/plugin/health/parent_monitor_provider_spec.rb
@@ -0,0 +1,144 @@
+require_relative '../test_helpers'
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include HealthModel
+include Minitest
+
+describe "ParentMonitorProvider spec" do
+ it 'returns correct parent_monitor_id for a non-condition case' do
+ #arrange
+ definition = JSON.parse('{
+ "monitor_id" : {
+ "parent_monitor_id": "parent_monitor_id",
+ "labels": [
+ "label_1",
+ "label_2"
+ ]
+ }
+ }'
+ )
+ health_model_definition = ParentMonitorProvider.new(definition)
+
+ monitor = Mock.new
+ def monitor.monitor_id; "monitor_id"; end
+ def monitor.monitor_instance_id; "monitor_instance_id"; end
+
+ #act
+ parent_id = health_model_definition.get_parent_monitor_id(monitor)
+ #assert
+ assert_equal parent_id, "parent_monitor_id"
+ end
+
+ it 'raises for an incorrect monitor id' do
+ #arrange
+ definition = JSON.parse('{
+ "monitor_id" : {
+ "parent_monitor_id": "parent_monitor_id",
+ "labels": [
+ "label_1",
+ "label_2"
+ ]
+ }
+ }'
+ )
+ health_model_definition = ParentMonitorProvider.new(definition)
+
+ monitor = Mock.new
+ def monitor.monitor_id; "monitor_id_!"; end
+ def monitor.monitor_instance_id; "monitor_instance_id"; end
+
+ #act and assert
+ assert_raises do
+ parent_id = health_model_definition.get_parent_monitor_id(monitor)
+ end
+ end
+
+ it 'returns correct parent_monitor_id for a conditional case' do
+ #arrange
+ definition = JSON.parse('{"conditional_monitor_id": {
+ "conditions": [
+ {
+ "key": "kubernetes.io/role",
+ "operator": "==",
+ "value": "master",
+ "parent_id": "master_node_pool"
+ },
+ {
+ "key": "kubernetes.io/role",
+ "operator": "==",
+ "value": "agent",
+ "parent_id": "agent_node_pool"
+ }
+ ],
+ "labels": [
+ "kubernetes.io/hostname",
+ "agentpool",
+ "kubernetes.io/role",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ],
+ "aggregation_algorithm": "worstOf",
+ "aggregation_algorithm_params": null
+ }
+
+ }'
+ )
+ health_model_definition = ParentMonitorProvider.new(definition)
+
+ monitor = Mock.new
+ def monitor.monitor_id; "conditional_monitor_id"; end
+ def monitor.monitor_instance_id; "conditional_monitor_instance_id"; end
+ def monitor.labels; {HealthMonitorLabels::ROLE => "master"}; end
+
+ #act
+ parent_id = health_model_definition.get_parent_monitor_id(monitor)
+ #assert
+ assert_equal parent_id, "master_node_pool"
+ end
+
+ it 'raises if conditions are not met' do
+ #arrange
+ definition = JSON.parse('{"conditional_monitor_id": {
+ "conditions": [
+ {
+ "key": "kubernetes.io/role",
+ "operator": "==",
+ "value": "master",
+ "parent_id": "master_node_pool"
+ },
+ {
+ "key": "kubernetes.io/role",
+ "operator": "==",
+ "value": "agent",
+ "parent_id": "agent_node_pool"
+ }
+ ],
+ "labels": [
+ "kubernetes.io/hostname",
+ "agentpool",
+ "kubernetes.io/role",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ],
+ "aggregation_algorithm": "worstOf",
+ "aggregation_algorithm_params": null
+ }
+
+ }'
+ )
+ health_model_definition = ParentMonitorProvider.new(definition)
+
+ monitor = Mock.new
+ def monitor.monitor_id; "conditional_monitor_id"; end
+ def monitor.monitor_instance_id; "conditional_monitor_instance_id"; end
+ def monitor.labels; {HealthMonitorLabels::ROLE => "master1"}; end
+
+ #act and assert
+ assert_raises do
+ parent_id = health_model_definition.get_parent_monitor_id(monitor)
+ end
+ end
+end
diff --git a/test/code/plugin/health/test_health_model_definition.json b/test/code/plugin/health/test_health_model_definition.json
new file mode 100644
index 000000000..31d219705
--- /dev/null
+++ b/test/code/plugin/health/test_health_model_definition.json
@@ -0,0 +1,42 @@
+[
+ {
+ "monitor_id": "monitor_id",
+ "parent_monitor_id": "parent_monitor_id",
+ "labels": [
+ "container.azm.ms/namespace",
+ "container.azm.ms/workload-name",
+ "container.azm.ms/workload-kind",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ]
+ },
+ {
+ "monitor_id": "conditional_monitor_id",
+ "aggregation_algorithm": "worstOf",
+ "labels": [
+ "kubernetes.io/hostname",
+ "agentpool",
+ "kubernetes.io/role",
+ "container.azm.ms/cluster-region",
+ "container.azm.ms/cluster-subscription-id",
+ "container.azm.ms/cluster-resource-group",
+ "container.azm.ms/cluster-name"
+ ],
+ "parent_monitor_id": [
+ {
+ "label": "kubernetes.io/role",
+ "operator": "==",
+ "value": "master",
+ "id": "master_node_pool"
+ },
+ {
+ "label": "kubernetes.io/role",
+ "operator": "==",
+ "value": "agent",
+ "id": "agent_node_pool"
+ }
+ ]
+ }
+]
\ No newline at end of file
diff --git a/test/code/plugin/health/unit_monitor_spec.rb b/test/code/plugin/health/unit_monitor_spec.rb
new file mode 100644
index 000000000..4cbf794db
--- /dev/null
+++ b/test/code/plugin/health/unit_monitor_spec.rb
@@ -0,0 +1,20 @@
+require_relative '../../../../source/code/plugin/health/unit_monitor'
+require_relative '../test_helpers'
+
+include HealthModel
+
+describe "UnitMonitor Spec" do
+ it "is_aggregate_monitor is false for UnitMonitor" do
+ # Arrange/Act
+ monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {})
+ # Assert
+ assert_equal monitor.is_aggregate_monitor, false
+ end
+
+ it "get_member_monitors is nil for UnitMonitor" do
+ # Arrange/Act
+ monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {})
+ #Assert
+ assert_nil monitor.get_member_monitors
+ end
+end
\ No newline at end of file
diff --git a/test/code/plugin/health/unit_monitor_test.rb b/test/code/plugin/health/unit_monitor_test.rb
new file mode 100644
index 000000000..e53617c99
--- /dev/null
+++ b/test/code/plugin/health/unit_monitor_test.rb
@@ -0,0 +1,16 @@
+require_relative '../../../../source/code/plugin/health/unit_monitor'
+require_relative '../test_helpers'
+
+class UnitMonitorTest < Minitest::Test
+ include HealthModel
+
+ def test_is_aggregate_monitor_false
+ monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {})
+ assert_equal monitor.is_aggregate_monitor, false
+ end
+
+ def test_get_member_monitors_nil
+ monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {})
+ assert_nil monitor.get_member_monitors
+ end
+end
diff --git a/test/code/plugin/test_helpers.rb b/test/code/plugin/test_helpers.rb
new file mode 100644
index 000000000..543f00ac9
--- /dev/null
+++ b/test/code/plugin/test_helpers.rb
@@ -0,0 +1,3 @@
+gem "minitest"
+require "minitest/spec"
+require 'minitest/autorun'
\ No newline at end of file