diff --git a/azure-mdsd_1.8.0-build.develop.1821_x86_64.deb b/azure-mdsd_1.8.0-build.develop.1821_x86_64.deb
new file mode 100755
index 000000000..265fadc10
Binary files /dev/null and b/azure-mdsd_1.8.0-build.develop.1821_x86_64.deb differ
diff --git a/build/linux/installer/conf/fluentd4.conf b/build/linux/installer/conf/fluentd4.conf
new file mode 100644
index 000000000..aecd1e31a
--- /dev/null
+++ b/build/linux/installer/conf/fluentd4.conf
@@ -0,0 +1,94 @@
+ #Kubernetes pod inventory
+
+ @type kube_podinventory
+ tag oms.containerinsights.KubePodInventory
+ run_interval 60
+ @log_level debug
+
+
+
+ @type forward
+ @log_level debug
+
+ host 0.0.0.0
+ port 29230
+
+ keepalive true
+
+ @type msgpack
+
+
+
+
+ @type forward
+ @log_level debug
+
+ host 0.0.0.0
+ port 29230
+
+ keepalive true
+
+ @type msgpack
+
+
+
+
+ @type mdm
+ @log_level debug
+ num_threads 5
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_mdm_*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+ retry_mdm_post_wait_minutes 30
+
+
+
+ @type oms
+ @log_level debug
+ num_threads 5
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
+
+
+ @type oms
+ @log_level debug
+ num_threads 2
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_containerinventory*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
+
+
+ @type oms
+ @log_level debug
+ num_threads 5
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
\ No newline at end of file
diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf
index fb566c360..3ba6e38dd 100644
--- a/build/linux/installer/conf/kube.conf
+++ b/build/linux/installer/conf/kube.conf
@@ -7,14 +7,6 @@
chunk_size_limit 4m
- #Kubernetes pod inventory
-
- type kubepodinventory
- tag oms.containerinsights.KubePodInventory
- run_interval 60
- log_level debug
-
-
#Kubernetes Persistent Volume inventory
type kubepvinventory
@@ -88,21 +80,6 @@
type filter_health_model_builder
-
- type out_oms
- log_level debug
- num_threads 5
- buffer_chunk_limit 4m
- buffer_type file
- buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer
- buffer_queue_limit 20
- buffer_queue_full_action drop_oldest_chunk
- flush_interval 20s
- retry_limit 10
- retry_wait 5s
- max_retry_wait 5m
-
-
type out_oms
log_level debug
@@ -133,21 +110,6 @@
max_retry_wait 5m
-
- type out_oms
- log_level debug
- num_threads 2
- buffer_chunk_limit 4m
- buffer_type file
- buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer
- buffer_queue_limit 20
- buffer_queue_full_action drop_oldest_chunk
- flush_interval 20s
- retry_limit 10
- retry_wait 5s
- max_retry_wait 5m
-
-
type out_oms
log_level debug
@@ -192,7 +154,7 @@
max_retry_wait 5m
-
+
type out_mdm
log_level debug
num_threads 5
diff --git a/build/linux/installer/conf/telegraf-rs.conf b/build/linux/installer/conf/telegraf-rs.conf
index d81196330..3f1a5b6f7 100644
--- a/build/linux/installer/conf/telegraf-rs.conf
+++ b/build/linux/installer/conf/telegraf-rs.conf
@@ -653,3 +653,36 @@ $AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER
#[inputs.prometheus.tagpass]
# operation_type = ["create_container", "remove_container", "pull_image"]
+
+
+[[inputs.procstat]]
+ #name_prefix="container.azm.ms/"
+ exe = "ruby"
+ interval = "2s"
+ pid_finder = "native"
+ pid_tag = true
+ name_override = "agent_telemetry"
+ fieldpass = ["cpu_usage", "memory_rss"]
+ [inputs.procstat.tags]
+ Computer = "$NODE_NAME"
+ AgentVersion = "$AGENT_VERSION"
+ ControllerType = "$CONTROLLER_TYPE"
+ AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID"
+ ACSResourceName = "$TELEMETRY_ACS_RESOURCE_NAME"
+ Region = "$TELEMETRY_AKS_REGION"
+
+[[inputs.procstat]]
+ #name_prefix="container.azm.ms/"
+ exe = "mdsd"
+ interval = "2s"
+ pid_finder = "native"
+ pid_tag = true
+ name_override = "agent_telemetry"
+ fieldpass = ["cpu_usage", "memory_rss"]
+ [inputs.procstat.tags]
+ Computer = "$NODE_NAME"
+ AgentVersion = "$AGENT_VERSION"
+ ControllerType = "$CONTROLLER_TYPE"
+ AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID"
+ ACSResourceName = "$TELEMETRY_ACS_RESOURCE_NAME"
+ Region = "$TELEMETRY_AKS_REGION"
diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data
index c680f0eea..43bdef908 100644
--- a/build/linux/installer/datafiles/base_container.data
+++ b/build/linux/installer/datafiles/base_container.data
@@ -27,20 +27,21 @@ MAINTAINER: 'Microsoft Corporation'
/opt/microsoft/omsagent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root
/etc/opt/microsoft/docker-cimprov/container.conf; build/linux/installer/conf/container.conf; 644; root; root
-
-/opt/microsoft/omsagent/plugin/CAdvisorMetricsAPIClient.rb; source/plugins/ruby/CAdvisorMetricsAPIClient.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/in_cadvisor_perf.rb; source/plugins/ruby/in_cadvisor_perf.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/in_win_cadvisor_perf.rb; source/plugins/ruby/in_win_cadvisor_perf.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/in_kubestate_deployments.rb; source/plugins/ruby/in_kubestate_deployments.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/in_kubestate_hpa.rb; source/plugins/ruby/in_kubestate_hpa.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/plugins/ruby/filter_inventory2mdm.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/podinventory_to_mdm.rb; source/plugins/ruby/podinventory_to_mdm.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/kubelet_utils.rb; source/plugins/ruby/kubelet_utils.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/plugins/ruby/CustomMetricsUtils.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/constants.rb; source/plugins/ruby/constants.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/MdmAlertTemplates.rb; source/plugins/ruby/MdmAlertTemplates.rb; 644; root; root
-/opt/microsoft/omsagent/plugin/MdmMetricsGenerator.rb; source/plugins/ruby/MdmMetricsGenerator.rb; 644; root; root
+/opt/fluent/fluentd4.conf; build/linux/installer/conf/fluentd4.conf; 644; root; root
+
+/opt/microsoft/omsagent/plugin/CAdvisorMetricsAPIClient.rb; source/plugins/ruby/CAdvisorMetricsAPIClient.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/in_cadvisor_perf.rb; source/plugins/ruby/in_cadvisor_perf.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/in_win_cadvisor_perf.rb; source/plugins/ruby/in_win_cadvisor_perf.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/in_kubestate_deployments.rb; source/plugins/ruby/in_kubestate_deployments.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/in_kubestate_hpa.rb; source/plugins/ruby/in_kubestate_hpa.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/plugins/ruby/filter_inventory2mdm.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/podinventory_to_mdm.rb; source/plugins/ruby/podinventory_to_mdm.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/kubelet_utils.rb; source/plugins/ruby/kubelet_utils.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/plugins/ruby/CustomMetricsUtils.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/constants.rb; source/plugins/ruby/constants.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/MdmAlertTemplates.rb; source/plugins/ruby/MdmAlertTemplates.rb; 644; root; root
+/opt/microsoft/omsagent/plugin/MdmMetricsGenerator.rb; source/plugins/ruby/MdmMetricsGenerator.rb; 644; root; root
/opt/microsoft/omsagent/plugin/ApplicationInsightsUtility.rb; source/plugins/ruby/ApplicationInsightsUtility.rb; 644; root; root
@@ -165,6 +166,101 @@ MAINTAINER: 'Microsoft Corporation'
/opt/microsoft/omsagent/plugin/health/monitor_set.rb; source/plugins/ruby/health/monitor_set.rb; 644; root; root
/opt/microsoft/omsagent/plugin/health/unit_monitor.rb; source/plugins/ruby/health/unit_monitor.rb; 644; root; root
+
+
+
+
+
+
+
+
+
+
+
+
+
+/etc/fluent/plugin/in_kube_podinventory.rb; source/plugins/ruby-fluentd4/in_kube_podinventory.rb; 644; root; root
+/etc/fluent/plugin/KubernetesApiClient.rb; source/plugins/ruby-fluentd4/KubernetesApiClient.rb; 644; root; root
+/etc/fluent/plugin/out_mdm.rb; source/plugins/ruby-fluentd4/out_mdm.rb; 644; root; root
+/etc/fluent/plugin/CustomMetricsUtils.rb; source/plugins/ruby-fluentd4/CustomMetricsUtils.rb; 644; root; root
+/etc/fluent/plugin/constants.rb; source/plugins/ruby-fluentd4/constants.rb; 644; root; root
+/etc/fluent/plugin/MdmAlertTemplates.rb; source/plugins/ruby-fluentd4/MdmAlertTemplates.rb; 644; root; root
+/etc/fluent/plugin/MdmMetricsGenerator.rb; source/plugins/ruby-fluentd4/MdmMetricsGenerator.rb; 644; root; root
+/etc/fluent/plugin/podinventory_to_mdm.rb; source/plugins/ruby-fluentd4/podinventory_to_mdm.rb; 644; root; root
+/etc/fluent/plugin/arc_k8s_cluster_identity.rb; source/plugins/ruby-fluentd4/arc_k8s_cluster_identity.rb; 644; root; root
+
+
+/etc/fluent/plugin/ApplicationInsightsUtility.rb; source/plugins/ruby-fluentd4/ApplicationInsightsUtility.rb; 644; root; root
+/etc/fluent/plugin/DockerApiClient.rb; source/plugins/ruby-fluentd4/DockerApiClient.rb; 644; root; root
+/etc/fluent/plugin/DockerApiRestHelper.rb; source/plugins/ruby-fluentd4/DockerApiRestHelper.rb; 644; root; root
+/etc/fluent/plugin/kubernetes_container_inventory.rb; source/plugins/ruby-fluentd4/kubernetes_container_inventory.rb; 644; root; root
+/etc/fluent/plugin/proxy_utils.rb; source/plugins/ruby-fluentd4/proxy_utils.rb; 644; root; root
+
+
+/etc/fluent/plugin/lib/application_insights/version.rb; source/plugins/ruby-fluentd4/lib/application_insights/version.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/rack/track_request.rb; source/plugins/ruby-fluentd4/lib/application_insights/rack/track_request.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/unhandled_exception.rb; source/plugins/ruby-fluentd4/lib/application_insights/unhandled_exception.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/telemetry_client.rb; source/plugins/ruby-fluentd4/lib/application_insights/telemetry_client.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/queue_base.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/queue_base.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/asynchronous_queue.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/asynchronous_queue.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/synchronous_sender.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/synchronous_sender.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/data_point_type.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data_point_type.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/data_point.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data_point.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/stack_frame.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/stack_frame.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/request_data.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/request_data.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/session.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/session.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/page_view_data.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/page_view_data.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/remote_dependency_data.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/remote_dependency_data.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/exception_data.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/exception_data.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/location.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/location.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/operation.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/operation.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/data.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/event_data.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/event_data.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/metric_data.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/metric_data.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/device.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/device.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/message_data.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/message_data.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/dependency_source_type.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/dependency_source_type.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/user.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/user.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/severity_level.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/severity_level.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/application.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/application.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/dependency_kind.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/dependency_kind.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/cloud.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/cloud.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/envelope.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/envelope.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/json_serializable.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/json_serializable.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/domain.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/domain.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/base.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/base.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/reopenings.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/reopenings.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/page_view_perf_data.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/page_view_perf_data.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/internal.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/internal.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/availability_data.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/availability_data.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/contracts/exception_details.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/exception_details.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/synchronous_queue.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/synchronous_queue.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/sender_base.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/sender_base.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/telemetry_context.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/telemetry_context.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/asynchronous_sender.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/asynchronous_sender.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/telemetry_channel.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/telemetry_channel.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights/channel/event.rb; source/plugins/ruby-fluentd4/lib/application_insights/channel/event.rb; 644; root; root
+/etc/fluent/plugin/lib/application_insights.rb; source/plugins/ruby-fluentd4/lib/application_insights.rb; 644; root; root
+
+
+/etc/fluent/plugin/oms_common.rb; source/plugins/ruby-fluentd4/oms_common.rb; 644; root; root
+/etc/fluent/plugin/oms_configuration.rb; source/plugins/ruby-fluentd4/oms_configuration.rb; 644; root; root
+/etc/fluent/plugin/oms_omi_lib.rb; source/plugins/ruby-fluentd4/oms_omi_lib.rb; 644; root; root
+/etc/fluent/plugin/omslog.rb; source/plugins/ruby-fluentd4/omslog.rb; 644; root; root
+/etc/fluent/plugin/out_oms.rb; source/plugins/ruby-fluentd4/out_oms.rb; 644; root; root
+/etc/fluent/plugin/agent_telemetry_script.rb; source/plugins/ruby-fluentd4/agent_telemetry_script.rb; 644; root; root
+/etc/fluent/plugin/agent_common.rb; source/plugins/ruby-fluentd4/agent_common.rb; 644; root; root
+/etc/fluent/plugin/agent_maintenance_script.rb; source/plugins/ruby-fluentd4/agent_maintenance_script.rb; 644; root; root
+/etc/fluent/plugin/agent_topology_request_script.rb; source/plugins/ruby-fluentd4/agent_topology_request_script.rb; 644; root; root
+
+
+
+
+
+
+
+
+
%Links
/opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root
@@ -197,6 +293,8 @@ MAINTAINER: 'Microsoft Corporation'
/opt/omi; 755; root; root; sysdir
/opt/omi/lib; 755; root; root; sysdir
+/opt/fluent; 755; root; root; sysdir
+
/var/opt/microsoft; 755; root; root; sysdir
/var/opt/microsoft/docker-cimprov; 755; root; root
/var/opt/microsoft/docker-cimprov/state; 755; root; root
@@ -215,6 +313,15 @@ MAINTAINER: 'Microsoft Corporation'
/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts; 755; root; root; sysdir
/opt/microsoft/omsagent/plugin/lib/application_insights/rack; 755; root; root; sysdir
+/etc/fluent; 755; root; root; sysdir
+/etc/fluent/plugin; 755; root; root; sysdir
+/etc/fluent/plugin/health; 755; root; root; sysdir
+/etc/fluent/plugin/lib; 755; root; root; sysdir
+/etc/fluent/plugin/lib/application_insights; 755; root; root; sysdir
+/etc/fluent/plugin/lib/application_insights/channel; 755; root; root; sysdir
+/etc/fluent/plugin/lib/application_insights/channel/contracts; 755; root; root; sysdir
+/etc/fluent/plugin/lib/application_insights/rack; 755; root; root; sysdir
+
/opt/tomlrb; 755; root; root; sysdir
%Dependencies
diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh
index e3f9fb475..7880370bc 100644
--- a/build/linux/installer/scripts/livenessprobe.sh
+++ b/build/linux/installer/scripts/livenessprobe.sh
@@ -26,6 +26,14 @@ then
exit 1
fi
+#test to exit non zero value if fluentd is not running
+(ps -ef | grep fluentd | grep -v "grep")
+if [ $? -ne 0 ]
+then
+ echo "Fluentd is not running" > /dev/termination-log
+ exit 1
+fi
+
if [ ! -s "inotifyoutput.txt" ]
then
# inotifyoutput file is empty and the grep commands for omsagent and td-agent-bit succeeded
diff --git a/charts/azuremonitor-containers-2.8.0.tgz b/charts/azuremonitor-containers-2.8.0.tgz
new file mode 100644
index 000000000..979ff1dc0
Binary files /dev/null and b/charts/azuremonitor-containers-2.8.0.tgz differ
diff --git a/charts/azuremonitor-containers-highscale-private-preview-2.8.0.tgz b/charts/azuremonitor-containers-highscale-private-preview-2.8.0.tgz
new file mode 100644
index 000000000..bba98f7a9
Binary files /dev/null and b/charts/azuremonitor-containers-highscale-private-preview-2.8.0.tgz differ
diff --git a/charts/azuremonitor-containers-highscale-private-preview/.helmignore b/charts/azuremonitor-containers-highscale-private-preview/.helmignore
new file mode 100644
index 000000000..f0c131944
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/.helmignore
@@ -0,0 +1,21 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
diff --git a/charts/azuremonitor-containers-highscale-private-preview/Chart.yaml b/charts/azuremonitor-containers-highscale-private-preview/Chart.yaml
new file mode 100644
index 000000000..c1d10c1a2
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/Chart.yaml
@@ -0,0 +1,38 @@
+apiVersion: v1
+appVersion: 7.0.0-1
+description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes
+name: azuremonitor-containers-highscale-private-preview
+version: 2.8.0
+kubeVersion: "^1.10.0-0"
+keywords:
+ - monitoring
+ - azuremonitor
+ - azure
+ - oms
+ - containerinsights
+ - metric
+ - event
+ - logs
+ - containerhealth
+ - kubernetesmonitoring
+ - acs-engine
+ - aks-engine
+ - azurestack
+ - openshift v4
+ - azure redhat openshift v4
+ - on-prem kubernetes monitoring
+ - containerlogs
+ - containerhealth
+ - containermonitoring
+ - hybrid kubernetes monitoring
+ - kubernetes
+ - kuberneteshealth
+home: https://docs.microsoft.com/en-us/azure/monitoring/monitoring-container-health
+icon: https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/img/azuremonitor-containers.svg
+sources:
+ - https://github.com/microsoft/Docker-Provider/tree/ci_prod
+maintainers:
+ - name: vishiy
+ email: visnara@microsoft.com
+ - name: ganga1980
+ email: gangams@microsoft.com
diff --git a/charts/azuremonitor-containers-highscale-private-preview/README.md b/charts/azuremonitor-containers-highscale-private-preview/README.md
new file mode 100644
index 000000000..fccbe0c97
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/README.md
@@ -0,0 +1,178 @@
+# Azure Monitor – Containers
+
+---
+
+## Introduction
+
+This article describes how to set up and use [Azure Monitor - Containers](https://docs.microsoft.com/en-us/azure/monitoring/monitoring-container-health) to monitor the health and performance of your workloads deployed to Kubernetes and OpenShift v4 environments.
+
+Monitoring your Kubernetes cluster and containers is critical, especially when running a production cluster, at scale, with multiple applications.
+
+---
+
+## Pre-requisites
+
+- [Kubernetes versions and support policy same as AKS supported versions](https://docs.microsoft.com/en-us/azure/aks/supported-kubernetes-versions)
+
+- You will need to create a location to store your monitoring data.
+
+1. [Create Azure Log Analytics Workspace](https://docs.microsoft.com/en-us/azure/log-analytics/log-analytics-quick-create-workspace)
+
+- You will need to add AzureMonitor-Containers solution to your workspace from #1 above
+
+2. [Add the 'AzureMonitor-Containers' Solution to your Log Analytics workspace.](http://aka.ms/coinhelmdoc)
+
+3. [For AKS-Engine or ACS-Engine K8S cluster hosted in Azure, add required tags on cluster resources, to be able to use Azure Container monitoring User experience (aka.ms/azmon-containers)](http://aka.ms/coin-acs-tag-doc)
+ > Note: Pre-requisite #3 is not applicable for AKS-Engine or ACS-Engine clusters hosted in Azure Stack or on-premises.
+
+---
+
+## Installing the Chart
+
+> Note: If you want to customize the chart, fork the chart code in https://github.com/microsoft/Docker-Provider/tree/ci_prod/charts/azuremonitor-containers
+
+> Note: `--name` flag not required in Helm3 since this flag is deprecated
+
+> Note: use the `omsagent.proxy` parameter to set the proxy endpoint if your K8s cluster is configured behind a proxy. Refer to [configure proxy](#Configuring-Proxy-Endpoint) for more details about proxy configuration.
+
+### To Use Azure Log Analytics Workspace in Public Cloud
+
+```bash
+$ helm repo add microsoft https://microsoft.github.io/charts/repo
+$ helm install --name azmon-containers-release-1 \
+--set omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= microsoft/azuremonitor-containers
+```
+
+### To Use Azure Log Analytics Workspace in Azure China Cloud
+
+```bash
+$ helm repo add microsoft https://microsoft.github.io/charts/repo
+$ helm install --name azmon-containers-release-1 \
+--set omsagent.domain=opinsights.azure.cn,omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= microsoft/azuremonitor-containers
+```
+
+### To Use Azure Log Analytics Workspace in Azure US Government Cloud
+
+```bash
+$ helm repo add microsoft https://microsoft.github.io/charts/repo
+$ helm install --name azmon-containers-release-1 \
+--set omsagent.domain=opinsights.azure.us,omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= microsoft/azuremonitor-containers
+```
+
+## Upgrading an existing Release to a new version
+
+If the previous version of the chart was installed with Helm2, it can be upgraded successfully to the current version using Helm2.
+However, if the previous version of the chart was installed with Helm3, or the release was migrated to Helm3, then the chart can't be upgraded to the latest version due to issues in Helm3 with upgrading an existing release to a new version, as described in [Helm issue #6850](https://github.com/helm/helm/issues/6850)
+
+## Uninstalling the Chart
+
+To uninstall/delete the `azmon-containers-release-1` release:
+> Note: the `--purge` flag is not required in Helm3 since this flag is deprecated
+```bash
+
+$ helm del --purge azmon-containers-release-1
+
+```
+The command removes all the Kubernetes components associated with the chart and deletes the release.
+
+## Configuration
+
+The following table lists the configurable parameters of the MSOMS chart and their default values.
+
+
+
+| Parameter | Description | Default |
+| ----------------------- | --------------------------------------------------------| --------------------------------------------------------------------------------------------------------------------------- |
+| `omsagent.image.tag` | image tag for Linux Agent. | Most recent release |
+| `omsagent.image.tagWindows` | image tag for Windows Agent. | Most recent release |
+| `omsagent.image.imagerepo` | image repo for Linux & Windows. | For Public and US Govt cloud: mcr.microsoft.com/azuremonitor/containerinsights/ciprod and For China Cloud: mcr.azk8s.cn/azuremonitor/containerinsights/ciprod
+| `omsagent.image.pullPolicy` | image pull policy for the agent. | IfNotPresent |
+| `omsagent.secret.wsid` | Azure Log analytics workspace id | Does not have a default value, needs to be provided |
+| `omsagent.secret.key` | Azure Log analytics workspace key | Does not have a default value, needs to be provided |
+| `omsagent.domain` | Azure Log analytics cloud domain (public,china, us govt)| opinsights.azure.com (Public cloud as default), opinsights.azure.cn (China Cloud), opinsights.azure.us (US Govt Cloud) |
+| `omsagent.env.clusterName` | Name of your cluster | Does not have a default value, needs to be provided |
+| `omsagent.rbac` | rbac enabled/disabled | true (i.e.enabled) |
+| `omsagent.proxy` | Proxy endpoint | Does not have a default value. Refer to [configure proxy](#Configuring-Proxy-Endpoint) |
+
+> Note: For Azure Manage K8s clusters such as Azure Arc K8s and ARO v4, `omsagent.env.clusterId` with fully qualified azure resource id of the cluster should be used instead of `omsagent.env.clusterName`
+
+### Note
+
+- Parameter `omsagent.env.doNotCollectKubeSystemLogs` has been removed starting chart version 1.0.0. Refer to 'Agent data collection settings' section below to configure it using configmap.
+- Onboarding multiple clusters with the same cluster name to the same Log Analytics workspace is not supported. If you need this configuration, use the cluster FQDN rather than the cluster DNS prefix to avoid collision on clusterName.
+
+## Agent data collection settings
+
+Starting with chart version 1.0.0, agent data collection settings are controlled through a config map. Refer to the documentation about agent data collection settings [here](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-agent-config)
+
+You can create an Azure Log Analytics workspace from portal.azure.com and get its ID & PRIMARY KEY from the 'Advanced Settings' tab in the UX.
+
+Specify each parameter using the `--set key=value[,key=value]` argument to `helm install`. For example,
+
+
+```bash
+
+$ helm install --name myrelease-1 \
+--set omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName=
+ microsoft/azuremonitor-containers-highscale-private-preview
+```
+Alternatively, a YAML file that specifies the values for the parameters can be provided while installing the chart. For example,
+
+```bash
+
+$ helm install --name myrelease-1 -f values.yaml microsoft/azuremonitor-containers-highscale-private-preview
+
+```
+
+After you successfully deploy the chart, you will be able to see your data in
+- [azure public cloud portal](https://aka.ms/azmon-containers) for the clusters in Azure Public Cloud
+- [azure china cloud portal](https://aka.ms/azmon-containers-mooncake) for the clusters in Azure China Cloud
+- [azure us government cloud portal](https://aka.ms/azmon-containers-fairfax) for the clusters in Azure US Government Cloud
+
+If you need help with this chart, please reach out to us through [this](mailto:askcoin@microsoft.com) email.
+
+## Custom resource
+
+Starting with chart version 2.0.0, chart will create a CRD (healthstates.azmon.container.insights) in kube-system namespace. This is used by the agent for cluster health monitoring.
+## Container Runtime(s)
+
+Starting with chart version 2.7.0, the chart supports Container Runtime Interface (CRI) compatible runtimes such as CRI-O and ContainerD, in addition to Docker/Moby.
+
+## Configuring Proxy Endpoint
+
+Starting with chart version 2.7.1, chart will support specifying the Proxy endpoint via `omsagent.proxy` chart parameter so that all remote outbound traffic will be routed via configured proxy endpoint.
+
+Communication between the Azure Monitor for containers agent and Azure Monitor backend can use an HTTP or HTTPS proxy server.
+
+Both anonymous and basic authentication (username/password) proxies are supported.
+
+The proxy configuration value has the following syntax:
+[protocol://][user:password@]proxyhost[:port]
+
+Property|Description
+-|-
+Protocol|http or https
+user|username for proxy authentication
+password|password for proxy authentication
+proxyhost|Address or FQDN of the proxy server
+port|port number for the proxy server
+
+For example:
+`omsagent.proxy=http://user01:password@proxy01.contoso.com:8080`
+
+> Note: Even if you do not have a user/password set for the proxy, you will still need to add a pseudo user/password. This can be any username or password.
+
+The Azure Monitor for containers agent only creates secure connection over http.
+Even if you specify the protocol as http, please note that http requests are created using SSL/TLS secure connection so the proxy must support SSL/TLS.
+
+## Support for Windows Container Logs
+
+Starting with chart version 2.7.1, chart deploys the daemonset on windows nodes which collects std{out;err} logs of the containers running on windows nodes.
+
+## Ux
+
+Once the Azure Monitor for containers chart successfully onboarded, you should be able to view insights of your cluster [Azure Portal](http://aka.ms/azmon-containers)
+
+# Contact
+
+If you have any questions or feedback regarding the container monitoring addon, please reach out to us through [this](mailto:askcoin@microsoft.com) email.
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/NOTES.txt b/charts/azuremonitor-containers-highscale-private-preview/templates/NOTES.txt
new file mode 100644
index 000000000..48ebf33fc
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/NOTES.txt
@@ -0,0 +1,51 @@
+{{- if eq .Values.omsagent.secret.wsid "" }}
+
+##############################################################################
+#### ERROR: You did not provide Azure Log Analytics workspace ID. ####
+##############################################################################
+
+{{- end }}
+
+{{- if eq .Values.omsagent.secret.key "" }}
+
+##############################################################################
+#### ERROR: You did not provide Azure Log Analytics workspace key ####
+##############################################################################
+
+{{- end }}
+
+{{- if and (eq .Values.omsagent.env.clusterName "") (eq .Values.omsagent.env.clusterId "") (eq .Values.Azure.Cluster.ResourceId "") }}
+
+##############################################################################
+#### ERROR: You did not provide cluster name ####
+##############################################################################
+
+{{- end }}
+
+{{- if or (eq .Values.omsagent.secret.key "") (eq .Values.omsagent.secret.wsid "") (and (eq .Values.omsagent.env.clusterName "") (eq .Values.omsagent.env.clusterId "") (eq .Values.Azure.Cluster.ResourceId "") )}}
+
+This deployment will not complete. To proceed, run
+ helm upgrade {{ .Release.Name }} \
+ --set omsagent.secret.wsid=<your_workspace_id> \
+ --set omsagent.secret.key=<your_workspace_key> \
+ --set omsagent.env.clusterName=<your_cluster_name> \
+ microsoft/azuremonitor-containers
+
+{{- else -}}
+
+{{ .Release.Name }} deployment is complete.
+{{ .Values.omsagent.domain}} is the configured Azure Log Analytics workspace domain.
+Data should start flowing to the Log Analytics workspace shortly.
+Use the appropriate link below to view health and monitoring data for your clusters:
+{{- if eq (.Values.omsagent.domain | lower) "opinsights.azure.com" }}
+- Azure Public Cloud Portal URL : https://aka.ms/azmon-containers
+{{- else if eq (.Values.omsagent.domain | lower) "opinsights.azure.cn" }}
+- Azure China Cloud Portal URL: https://aka.ms/azmon-containers-mooncake
+{{- else if eq (.Values.omsagent.domain | lower) "opinsights.azure.us" }}
+- Azure US Government Cloud Portal URL: https://aka.ms/azmon-containers-fairfax
+{{- else }}
+########################################################################################
+#### ERROR: Provided omsagent.domain value either invalid or not supported ####
+########################################################################################
+{{- end }}
+{{- end }}
\ No newline at end of file
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/_helpers.tpl b/charts/azuremonitor-containers-highscale-private-preview/templates/_helpers.tpl
new file mode 100644
index 000000000..d6ae95ae1
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/_helpers.tpl
@@ -0,0 +1,32 @@
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "azuremonitor-containers.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "azuremonitor-containers.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "azuremonitor-containers.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-arc-k8s-crd.yaml b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-arc-k8s-crd.yaml
new file mode 100644
index 000000000..b7482b8b5
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-arc-k8s-crd.yaml
@@ -0,0 +1,26 @@
+{{- if or ( contains "microsoft.kubernetes/connectedclusters" (.Values.Azure.Cluster.ResourceId | lower) ) ( contains "microsoft.kubernetes/connectedclusters" (.Values.omsagent.env.clusterId | lower)) }}
+#extension model
+{{- if not (empty .Values.Azure.Extension.Name) }}
+apiVersion: clusterconfig.azure.com/v1beta1
+kind: AzureExtensionIdentity
+metadata:
+ name: {{ .Values.Azure.Extension.Name }}
+ namespace: azure-arc
+spec:
+ serviceAccounts:
+ - name: omsagent
+ namespace: kube-system
+ tokenNamespace: azure-arc
+---
+{{- end }}
+apiVersion: clusterconfig.azure.com/v1beta1
+kind: AzureClusterIdentityRequest
+metadata:
+ name: container-insights-clusteridentityrequest
+ namespace: azure-arc
+spec:
+ audience: https://monitoring.azure.com/
+ {{- if not (empty .Values.Azure.Extension.Name) }}
+ resourceId: {{ .Values.Azure.Extension.Name }}
+ {{- end }}
+{{- end }}
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-crd.yaml b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-crd.yaml
new file mode 100644
index 000000000..bbaf89a52
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-crd.yaml
@@ -0,0 +1,36 @@
+{{- if semverCompare "<1.19-0" .Capabilities.KubeVersion.GitVersion }}
+apiVersion: apiextensions.k8s.io/v1beta1
+kind: CustomResourceDefinition
+metadata:
+ name: healthstates.azmon.container.insights
+ namespace: kube-system
+spec:
+ group: azmon.container.insights
+ version: v1
+ scope: Namespaced
+ names:
+ plural: healthstates
+ kind: HealthState
+{{- else }}
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+ name: healthstates.azmon.container.insights
+ namespace: kube-system
+spec:
+ group: azmon.container.insights
+ versions:
+ - name: v1
+ served: true
+ storage: true
+ schema:
+ openAPIV3Schema:
+ type: object
+ properties:
+ state:
+ type: string
+ scope: Namespaced
+ names:
+ plural: healthstates
+ kind: HealthState
+{{- end }}
\ No newline at end of file
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-daemonset-windows.yaml
new file mode 100644
index 000000000..82d210f3d
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-daemonset-windows.yaml
@@ -0,0 +1,126 @@
+{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId ""))}}
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: omsagent-win
+ namespace: kube-system
+ labels:
+ chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ component: oms-agent-win
+ tier: node-win
+spec:
+ updateStrategy:
+ type: RollingUpdate
+ selector:
+ matchLabels:
+ dsName: "omsagent-ds"
+ template:
+ metadata:
+ labels:
+ dsName: "omsagent-ds"
+ annotations:
+ agentVersion: {{ .Values.omsagent.image.tagWindows }}
+ dockerProviderVersion: {{ .Values.omsagent.image.dockerProviderVersion }}
+ schema-versions: "v1"
+ checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }}
+ checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }}
+ spec:
+ priorityClassName: omsagent
+ dnsConfig:
+ options:
+ - name: ndots
+ value: "3"
+{{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }}
+ nodeSelector:
+ kubernetes.io/os: windows
+{{- else }}
+ nodeSelector:
+ beta.kubernetes.io/os: windows
+{{- end }}
+ {{- if .Values.omsagent.rbac }}
+ serviceAccountName: omsagent
+ {{- end }}
+ containers:
+ - name: omsagent-win
+ {{- if eq (.Values.omsagent.domain | lower) "opinsights.azure.cn" }}
+ image: "mcr.azk8s.cn/azuremonitor/containerinsights/ciprod:{{ .Values.omsagent.image.tagWindows }}"
+ {{- else }}
+ image: {{ printf "%s:%s" .Values.omsagent.image.repo .Values.omsagent.image.tagWindows }}
+ {{- end }}
+ imagePullPolicy: IfNotPresent
+ resources:
+{{ toYaml .Values.omsagent.resources.daemonsetwindows | indent 9 }}
+ env:
+ {{- if ne .Values.omsagent.env.clusterId "" }}
+ - name: AKS_RESOURCE_ID
+ value: {{ .Values.omsagent.env.clusterId | quote }}
+ {{- if ne .Values.omsagent.env.clusterRegion "" }}
+ - name: AKS_REGION
+ value: {{ .Values.omsagent.env.clusterRegion | quote }}
+ {{- end }}
+ {{- else if ne .Values.Azure.Cluster.ResourceId "" }}
+ - name: AKS_RESOURCE_ID
+ value: {{ .Values.Azure.Cluster.ResourceId | quote }}
+ {{- if ne .Values.Azure.Cluster.Region "" }}
+ - name: AKS_REGION
+ value: {{ .Values.Azure.Cluster.Region | quote }}
+ {{- end }}
+ {{- else }}
+ - name: ACS_RESOURCE_NAME
+ value: {{ .Values.omsagent.env.clusterName | quote }}
+ {{- end }}
+ - name: CONTROLLER_TYPE
+ value: "DaemonSet"
+ - name: HOSTNAME
+ valueFrom:
+ fieldRef:
+ fieldPath: spec.nodeName
+ - name: NODE_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: status.hostIP
+ volumeMounts:
+ - mountPath: C:\ProgramData\docker\containers
+ name: docker-windows-containers
+ readOnly: true
+ - mountPath: C:\var #Read + Write access on this for position file
+ name: docker-windows-kuberenetes-container-logs
+ - mountPath: C:\etc\config\settings
+ name: settings-vol-config
+ readOnly: true
+ - mountPath: C:\etc\omsagent-secret
+ name: omsagent-secret
+ readOnly: true
+ livenessProbe:
+ exec:
+ command:
+ - cmd
+ - /c
+ - C:\opt\omsagentwindows\scripts\cmd\livenessProbe.cmd
+ periodSeconds: 60
+ initialDelaySeconds: 180
+ timeoutSeconds: 15
+ {{- with .Values.omsagent.tolerations }}
+ tolerations: {{- toYaml . | nindent 8 }}
+ {{- end }}
+ volumes:
+ - name: docker-windows-kuberenetes-container-logs
+ hostPath:
+ path: C:\var
+ - name: docker-windows-containers
+ hostPath:
+ path: C:\ProgramData\docker\containers
+ - name: settings-vol-config
+ configMap:
+ name: container-azm-ms-agentconfig
+ optional: true
+ - name: omsagent-secret
+ secret:
+ secretName: omsagent-secret
+ - name: omsagent-adx-secret
+ secret:
+ secretName: omsagent-adx-secret
+ optional: true
+{{- end }}
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-daemonset.yaml
new file mode 100644
index 000000000..0272c6263
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-daemonset.yaml
@@ -0,0 +1,173 @@
+{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}}
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: omsagent
+ namespace: kube-system
+ labels:
+ chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ component: oms-agent
+ tier: node
+spec:
+ updateStrategy:
+ type: RollingUpdate
+ selector:
+ matchLabels:
+ dsName: "omsagent-ds"
+ template:
+ metadata:
+ labels:
+ dsName: "omsagent-ds"
+ annotations:
+ agentVersion: {{ .Values.omsagent.image.tag }}
+ dockerProviderVersion: {{ .Values.omsagent.image.dockerProviderVersion }}
+ schema-versions: "v1"
+ checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }}
+ checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }}
+ checksum/logsettings: {{ toYaml .Values.omsagent.logsettings | sha256sum }}
+ spec:
+ priorityClassName: omsagent
+ dnsConfig:
+ options:
+ - name: ndots
+ value: "3"
+ {{- if .Values.omsagent.rbac }}
+ serviceAccountName: omsagent
+ {{- end }}
+ containers:
+ - name: omsagent
+ {{- if eq (.Values.omsagent.domain | lower) "opinsights.azure.cn" }}
+ image: "mcr.azk8s.cn/azuremonitor/containerinsights/ciprod:{{ .Values.omsagent.image.tag }}"
+ {{- else }}
+ image: {{ printf "%s:%s" .Values.omsagent.image.repo .Values.omsagent.image.tag }}
+ {{- end }}
+ imagePullPolicy: IfNotPresent
+ resources:
+{{ toYaml .Values.omsagent.resources.daemonsetlinux | indent 9 }}
+ env:
+ {{- if ne .Values.omsagent.env.clusterId "" }}
+ - name: AKS_RESOURCE_ID
+ value: {{ .Values.omsagent.env.clusterId | quote }}
+ {{- if ne .Values.omsagent.env.clusterRegion "" }}
+ - name: AKS_REGION
+ value: {{ .Values.omsagent.env.clusterRegion | quote }}
+ {{- end }}
+ {{- else if ne .Values.Azure.Cluster.ResourceId "" }}
+ - name: AKS_RESOURCE_ID
+ value: {{ .Values.Azure.Cluster.ResourceId | quote }}
+ {{- if ne .Values.Azure.Cluster.Region "" }}
+ - name: AKS_REGION
+ value: {{ .Values.Azure.Cluster.Region | quote }}
+ {{- end }}
+ {{- else }}
+ - name: ACS_RESOURCE_NAME
+ value: {{ .Values.omsagent.env.clusterName | quote }}
+ {{- end }}
+ - name: CONTROLLER_TYPE
+ value: "DaemonSet"
+ - name: NODE_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: status.hostIP
+ {{- if not (empty .Values.Azure.Extension.Name) }}
+ - name: ARC_K8S_EXTENSION_NAME
+ value: {{ .Values.Azure.Extension.Name | quote }}
+ {{- end }}
+ - name: USER_ASSIGNED_IDENTITY_CLIENT_ID
+ value: ""
+ {{- if .Values.omsagent.logsettings.logflushintervalsecs }}
+ - name: FBIT_SERVICE_FLUSH_INTERVAL
+ value: {{ .Values.omsagent.logsettings.logflushintervalsecs | quote }}
+ {{- end }}
+ {{- if .Values.omsagent.logsettings.tailbufchunksizemegabytes }}
+ - name: FBIT_TAIL_BUFFER_CHUNK_SIZE
+ value: {{ .Values.omsagent.logsettings.tailbufchunksizemegabytes | quote }}
+ {{- end }}
+ {{- if .Values.omsagent.logsettings.tailbufmaxsizemegabytes }}
+ - name: FBIT_TAIL_BUFFER_MAX_SIZE
+ value: {{ .Values.omsagent.logsettings.tailbufmaxsizemegabytes | quote }}
+ {{- end }}
+ securityContext:
+ privileged: true
+ ports:
+ - containerPort: 25225
+ protocol: TCP
+ - containerPort: 25224
+ protocol: UDP
+ volumeMounts:
+ - mountPath: /hostfs
+ name: host-root
+ readOnly: true
+ - mountPath: /var/run/host
+ name: docker-sock
+ - mountPath: /var/log
+ name: host-log
+ - mountPath: /var/lib/docker/containers
+ name: containerlog-path
+ - mountPath: /etc/kubernetes/host
+ name: azure-json-path
+ - mountPath: /etc/omsagent-secret
+ name: omsagent-secret
+ readOnly: true
+ - mountPath: /etc/config/settings
+ name: settings-vol-config
+ readOnly: true
+ {{- if .Values.omsagent.logsettings.custommountpath }}
+ - mountPath: {{ .Values.omsagent.logsettings.custommountpath }}
+ name: custom-mount-path
+ {{- end }}
+ - mountPath: /etc/config/settings/adx
+ name: omsagent-adx-secret
+ readOnly: true
+ livenessProbe:
+ exec:
+ command:
+ - /bin/bash
+ - -c
+ - "/opt/livenessprobe.sh"
+ initialDelaySeconds: 60
+ periodSeconds: 60
+ {{- with .Values.omsagent.daemonset.affinity }}
+ affinity: {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- with .Values.omsagent.tolerations }}
+ tolerations: {{- toYaml . | nindent 8 }}
+ {{- end }}
+ volumes:
+ - name: host-root
+ hostPath:
+ path: /
+ - name: docker-sock
+ hostPath:
+ path: /var/run
+ - name: container-hostname
+ hostPath:
+ path: /etc/hostname
+ - name: host-log
+ hostPath:
+ path: /var/log
+ - name: containerlog-path
+ hostPath:
+ path: /var/lib/docker/containers
+ - name: azure-json-path
+ hostPath:
+ path: /etc/kubernetes
+ - name: omsagent-secret
+ secret:
+ secretName: omsagent-secret
+ - name: settings-vol-config
+ configMap:
+ name: container-azm-ms-agentconfig
+ optional: true
+ {{- if .Values.omsagent.logsettings.custommountpath }}
+ - name: custom-mount-path
+ hostPath:
+ path: {{ .Values.omsagent.logsettings.custommountpath }}
+ {{- end }}
+ - name: omsagent-adx-secret
+ secret:
+ secretName: omsagent-adx-secret
+ optional: true
+{{- end }}
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-deployment.yaml
new file mode 100644
index 000000000..eb3a06f9d
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-deployment.yaml
@@ -0,0 +1,159 @@
+{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: omsagent-rs
+ namespace: kube-system
+ labels:
+ chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ component: oms-agent
+ tier: node
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ rsName: "omsagent-rs"
+ strategy:
+ type: RollingUpdate
+ template:
+ metadata:
+ labels:
+ rsName: "omsagent-rs"
+ annotations:
+ agentVersion: {{ .Values.omsagent.image.tag }}
+ dockerProviderVersion: {{ .Values.omsagent.image.dockerProviderVersion }}
+ schema-versions: "v1"
+ checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }}
+ checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }}
+ checksum/logsettings: {{ toYaml .Values.omsagent.logsettings | sha256sum }}
+ spec:
+ {{- if .Values.omsagent.rbac }}
+ serviceAccountName: omsagent
+ {{- end }}
+ containers:
+ - name: omsagent
+ {{- if eq (.Values.omsagent.domain | lower) "opinsights.azure.cn" }}
+ image: "mcr.azk8s.cn/azuremonitor/containerinsights/ciprod:{{ .Values.omsagent.rsimage.tag }}"
+ {{- else }}
+ image: {{ printf "%s:%s" .Values.omsagent.rsimage.repo .Values.omsagent.rsimage.tag }}
+ {{- end }}
+ imagePullPolicy: IfNotPresent
+ resources:
+{{ toYaml .Values.omsagent.resources.deployment | indent 9 }}
+ env:
+ {{- if ne .Values.omsagent.env.clusterId "" }}
+ - name: AKS_RESOURCE_ID
+ value: {{ .Values.omsagent.env.clusterId | quote }}
+ {{- if ne .Values.omsagent.env.clusterRegion "" }}
+ - name: AKS_REGION
+ value: {{ .Values.omsagent.env.clusterRegion | quote }}
+ {{- end }}
+ {{- else if ne .Values.Azure.Cluster.ResourceId "" }}
+ - name: AKS_RESOURCE_ID
+ value: {{ .Values.Azure.Cluster.ResourceId | quote }}
+ {{- if ne .Values.Azure.Cluster.Region "" }}
+ - name: AKS_REGION
+ value: {{ .Values.Azure.Cluster.Region | quote }}
+ {{- end }}
+ {{- else }}
+ - name: ACS_RESOURCE_NAME
+ value: {{ .Values.omsagent.env.clusterName | quote }}
+ {{- end }}
+ - name: CONTROLLER_TYPE
+ value: "ReplicaSet"
+ - name: NODE_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: status.hostIP
+ {{- if not (empty .Values.Azure.Extension.Name) }}
+ - name: ARC_K8S_EXTENSION_NAME
+ value: {{ .Values.Azure.Extension.Name | quote }}
+ {{- end }}
+ - name: USER_ASSIGNED_IDENTITY_CLIENT_ID
+ value: ""
+ securityContext:
+ privileged: true
+ ports:
+ - containerPort: 25225
+ protocol: TCP
+ - containerPort: 25224
+ protocol: UDP
+ - containerPort: 25227
+ protocol: TCP
+ name: in-rs-tcp
+ volumeMounts:
+ - mountPath: /var/run/host
+ name: docker-sock
+ - mountPath: /var/log
+ name: host-log
+ - mountPath: /var/lib/docker/containers
+ name: containerlog-path
+ - mountPath: /etc/kubernetes/host
+ name: azure-json-path
+ - mountPath: /etc/omsagent-secret
+ name: omsagent-secret
+ readOnly: true
+ - mountPath : /etc/config
+ name: omsagent-rs-config
+ - mountPath: /etc/config/settings
+ name: settings-vol-config
+ readOnly: true
+ {{- if .Values.omsagent.logsettings.custommountpath }}
+ - mountPath: {{ .Values.omsagent.logsettings.custommountpath }}
+ name: custom-mount-path
+ {{- end }}
+ - mountPath: /etc/config/settings/adx
+ name: omsagent-adx-secret
+ readOnly: true
+ livenessProbe:
+ exec:
+ command:
+ - /bin/bash
+ - -c
+ - "/opt/livenessprobe.sh"
+ initialDelaySeconds: 60
+ periodSeconds: 60
+ {{- with .Values.omsagent.deployment.affinity }}
+ affinity: {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- with .Values.omsagent.tolerations }}
+ tolerations: {{- toYaml . | nindent 8 }}
+ {{- end }}
+ volumes:
+ - name: docker-sock
+ hostPath:
+ path: /var/run
+ - name: container-hostname
+ hostPath:
+ path: /etc/hostname
+ - name: host-log
+ hostPath:
+ path: /var/log
+ - name: containerlog-path
+ hostPath:
+ path: /var/lib/docker/containers
+ - name: azure-json-path
+ hostPath:
+ path: /etc/kubernetes
+ - name: omsagent-secret
+ secret:
+ secretName: omsagent-secret
+ - name: omsagent-rs-config
+ configMap:
+ name: omsagent-rs-config
+ - name: settings-vol-config
+ configMap:
+ name: container-azm-ms-agentconfig
+ optional: true
+ {{- if .Values.omsagent.logsettings.custommountpath }}
+ - name: custom-mount-path
+ hostPath:
+ path: {{ .Values.omsagent.logsettings.custommountpath }}
+ {{- end }}
+ - name: omsagent-adx-secret
+ secret:
+ secretName: omsagent-adx-secret
+ optional: true
+{{- end }}
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-priorityclass.yaml b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-priorityclass.yaml
new file mode 100644
index 000000000..4d9980ab3
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-priorityclass.yaml
@@ -0,0 +1,22 @@
+{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}}
+# This pod priority class is used for daemonsets to allow them to have priority
+# over pods that can be scheduled elsewhere. Without a priority class, it is
+# possible for a node to fill up with pods before the daemonset pods get to be
+# created for the node or get scheduled. Note that pods are not "daemonset"
+# pods - they are just pods created by the daemonset controller but they have
+# a specific affinity set during creation to the specific node each pod was
+# created to run on (daemonset controller takes care of that)
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+ name: omsagent
+ # Priority classes don't have labels :-)
+ annotations:
+ chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ component: oms-agent
+value: {{ .Values.omsagent.priority }}
+globalDefault: false
+description: "This is the daemonset priority class for omsagent"
+{{- end }}
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-rbac.yaml b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-rbac.yaml
new file mode 100644
index 000000000..5db5c2dab
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-rbac.yaml
@@ -0,0 +1,61 @@
+{{- if .Values.omsagent.rbac }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: omsagent
+ namespace: kube-system
+ labels:
+ chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1beta1
+metadata:
+ name: omsagent-reader
+ labels:
+ chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+rules:
+- apiGroups: [""]
+ resources: ["pods", "events", "nodes", "nodes/stats", "nodes/metrics", "nodes/spec", "nodes/proxy", "namespaces", "services", "persistentvolumes"]
+ verbs: ["list", "get", "watch"]
+- apiGroups: ["apps", "extensions", "autoscaling"]
+ resources: ["replicasets", "deployments", "horizontalpodautoscalers"]
+ verbs: ["list"]
+- apiGroups: ["azmon.container.insights"]
+ resources: ["healthstates"]
+ verbs: ["get", "create", "patch"]
+- apiGroups: ["clusterconfig.azure.com"]
+ resources: ["azureclusteridentityrequests"]
+ resourceNames: ["container-insights-clusteridentityrequest"]
+ verbs: ["get", "create", "patch"]
+- nonResourceURLs: ["/metrics"]
+ verbs: ["get"]
+#arc k8s extension model grants access as part of the extension msi
+#remove this explicit permission once the extension available in public preview
+{{- if (empty .Values.Azure.Extension.Name) }}
+- apiGroups: [""]
+ resources: ["secrets"]
+ resourceNames: ["container-insights-clusteridentityrequest-token"]
+ verbs: ["get"]
+{{- end }}
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1beta1
+metadata:
+ name: omsagentclusterrolebinding
+ labels:
+ chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+subjects:
+ - kind: ServiceAccount
+ name: omsagent
+ namespace: kube-system
+roleRef:
+ kind: ClusterRole
+ name: omsagent-reader
+ apiGroup: rbac.authorization.k8s.io
+{{- end }}
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-rs-configmap.yaml b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-rs-configmap.yaml
new file mode 100644
index 000000000..de4a877cf
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-rs-configmap.yaml
@@ -0,0 +1,260 @@
+{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}}
+kind: ConfigMap
+apiVersion: v1
+data:
+ kube.conf: |
+ # Fluentd config file for OMS Docker - cluster components (kubeAPI)
+ #fluent forward plugin
+
+ type forward
+ port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}"
+ bind 0.0.0.0
+ chunk_size_limit 4m
+
+
+ #Kubernetes Persistent Volume inventory
+
+ type kubepvinventory
+ tag oms.containerinsights.KubePVInventory
+ run_interval 60
+ log_level debug
+
+
+ #Kubernetes events
+
+ type kubeevents
+ tag oms.containerinsights.KubeEvents
+ run_interval 60
+ log_level debug
+
+
+ #Kubernetes Nodes
+
+ type kubenodeinventory
+ tag oms.containerinsights.KubeNodeInventory
+ run_interval 60
+ log_level debug
+
+
+ #Kubernetes health
+
+ type kubehealth
+ tag kubehealth.ReplicaSet
+ run_interval 60
+ log_level debug
+
+
+ #cadvisor perf- Windows nodes
+
+ type wincadvisorperf
+ tag oms.api.wincadvisorperf
+ run_interval 60
+ log_level debug
+
+
+ #Kubernetes object state - deployments
+
+ type kubestatedeployments
+ tag oms.containerinsights.KubeStateDeployments
+ run_interval 60
+ log_level debug
+
+
+ #Kubernetes object state - HPA
+
+ type kubestatehpa
+ tag oms.containerinsights.KubeStateHpa
+ run_interval 60
+ log_level debug
+
+
+
+ type filter_inventory2mdm
+ log_level info
+
+
+ #custom_metrics_mdm filter plugin for perf data from windows nodes
+
+ type filter_cadvisor2mdm
+ metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes
+ log_level info
+
+
+ #health model aggregation filter
+
+ type filter_health_model_builder
+
+
+
+ type out_oms
+ log_level debug
+ num_threads 5
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
+
+
+ type out_oms
+ log_level debug
+ num_threads 2
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
+
+
+ type out_oms
+ log_level debug
+ num_threads 2
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
+
+
+ type out_oms
+ log_level debug
+ num_threads 2
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
+
+
+ type out_oms
+ log_level debug
+ num_threads 3
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer
+ buffer_queue_limit 20
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
+
+
+ type out_oms
+ log_level debug
+ num_threads 2
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
+
+
+ type out_mdm
+ log_level debug
+ num_threads 5
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_mdm_*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+ retry_mdm_post_wait_minutes 30
+
+
+
+ type out_oms
+ log_level debug
+ num_threads 5
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
+
+
+ type out_mdm
+ log_level debug
+ num_threads 5
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+ retry_mdm_post_wait_minutes 30
+
+
+
+ type out_oms
+ log_level debug
+ num_threads 5
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_kubehealth*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
+
+
+ type out_oms
+ log_level debug
+ num_threads 5
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+
+metadata:
+ name: omsagent-rs-config
+ namespace: kube-system
+ labels:
+ chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+{{- end }}
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-secret.yaml b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-secret.yaml
new file mode 100644
index 000000000..1a7f087ed
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-secret.yaml
@@ -0,0 +1,19 @@
+{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}}
+apiVersion: v1
+kind: Secret
+metadata:
+ name: omsagent-secret
+ namespace: kube-system
+ labels:
+ chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+type: Opaque
+data:
+ WSID: {{ required "A valid workspace id is required!" .Values.omsagent.secret.wsid | b64enc | quote }}
+ KEY: {{ required "A valid workspace key is required!" .Values.omsagent.secret.key | b64enc | quote }}
+ DOMAIN: {{ .Values.omsagent.domain | b64enc | quote }}
+ {{- if ne .Values.omsagent.proxy "" }}
+ PROXY: {{ .Values.omsagent.proxy | b64enc | quote }}
+ {{- end }}
+{{- end }}
diff --git a/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-service.yaml b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-service.yaml
new file mode 100644
index 000000000..00e6a1d3b
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/templates/omsagent-service.yaml
@@ -0,0 +1,12 @@
+kind: Service
+apiVersion: v1
+metadata:
+ name: healthmodel-replicaset-service
+ namespace: kube-system
+spec:
+ selector:
+ rsName: "omsagent-rs"
+ ports:
+ - protocol: TCP
+ port: 25227
+ targetPort: in-rs-tcp
diff --git a/charts/azuremonitor-containers-highscale-private-preview/values.yaml b/charts/azuremonitor-containers-highscale-private-preview/values.yaml
new file mode 100644
index 000000000..a3f32f785
--- /dev/null
+++ b/charts/azuremonitor-containers-highscale-private-preview/values.yaml
@@ -0,0 +1,185 @@
+# Default values for azuremonitor-containers-highscale-private-preview.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+## Microsoft OMS Agent image for kubernetes cluster monitoring
+## ref: https://github.com/microsoft/Docker-Provider/tree/ci_prod
+## Values of under Azure are being populated by Azure Arc K8s RP during the installation of the extension
+Azure:
+ Cluster:
+ Region:
+ ResourceId:
+ Extension:
+ Name: ""
+ ResourceId: ""
+omsagent:
+ image:
+ repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod"
+ tag: "ciprod01112021"
+ tagWindows: "win-ciprod01112021"
+ pullPolicy: IfNotPresent
+ dockerProviderVersion: "12.0.0-0"
+ agentVersion: "1.10.0.1"
+ rsimage:
+ repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod"
+ tag: "ciprod01112021"
+ tagWindows: "win-ciprod01112021"
+ pullPolicy: IfNotPresent
+ dockerProviderVersion: "12.0.0-0"
+ agentVersion: "1.10.0.1"
+
+ # The priority used by the omsagent priority class for the daemonset pods
+ # Note that this is not execution priority - it is scheduling priority, as
+ # in getting scheduled to the node. This needs to be greater than 0 such
+ # that the daemonset pods, which can not schedule onto different nodes as
+ # they are defined to run on specific nodes, are not accidentally frozen
+ # out of a node due to other pods showing up earlier in scheduling.
+ # (DaemonSet pods by definition only are created once the node exists for
+ # them to be created for and thus it is possible to have "normal" pods
+ # already in line to run on the node before the DaemonSet controller got a
+ # chance to build pod for the node and give it to the scheduler)
+ # Should be some number greater than default (0)
+ priority: 10
+
+ ## To get your workspace id and key do the following
+ ## You can create an Azure Log Analytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux.
+
+ secret:
+ wsid:
+ key:
+ domain: opinsights.azure.com
+ proxy:
+ env:
+ clusterName:
+ ## Applicable for only managed clusters hosted in Azure
+ clusterId:
+ clusterRegion:
+ rbac: true
+ logsettings:
+ logflushintervalsecs: ""
+ tailbufchunksizemegabytes: ""
+ tailbufmaxsizemegabytes: ""
+ ## Applicable for only Azure Stack Edge K8s since it has custom mount path for container logs which will have symlink to /var/log path
+ custommountpath: ""
+
+ ## Configure node tolerations for scheduling onto nodes with taints
+ ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
+ ##
+ tolerations:
+ - operator: "Exists"
+ effect: "NoSchedule"
+ - operator: "Exists"
+ effect: "NoExecute"
+ - operator: "Exists"
+ effect: "PreferNoSchedule"
+
+ ## Pod scheduling preferences.
+ ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
+ ##
+ daemonset:
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - labelSelector:
+ matchExpressions:
+ - key: kubernetes.io/os
+ operator: In
+ values:
+ - linux
+ - key: type
+ operator: NotIn
+ values:
+ - virtual-kubelet
+ - key: kubernetes.io/arch
+ operator: In
+ values:
+ - amd64
+ nodeSelectorTerms:
+ - labelSelector:
+ matchExpressions:
+ - key: beta.kubernetes.io/os
+ operator: In
+ values:
+ - linux
+ - key: type
+ operator: NotIn
+ values:
+ - virtual-kubelet
+ - key: beta.kubernetes.io/arch
+ operator: In
+ values:
+ - amd64
+ deployment:
+ affinity:
+ nodeAffinity:
+ # affinity to schedule on to ephemeral os node if its available
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - weight: 1
+ preference:
+ matchExpressions:
+ - key: storageprofile
+ operator: NotIn
+ values:
+ - managed
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - labelSelector:
+ matchExpressions:
+ - key: kubernetes.io/os
+ operator: In
+ values:
+ - linux
+ - key: type
+ operator: NotIn
+ values:
+ - virtual-kubelet
+ - key: kubernetes.io/role
+ operator: NotIn
+ values:
+ - master
+ - key: kubernetes.io/arch
+ operator: In
+ values:
+ - amd64
+ nodeSelectorTerms:
+ - labelSelector:
+ matchExpressions:
+ - key: beta.kubernetes.io/os
+ operator: In
+ values:
+ - linux
+ - key: type
+ operator: NotIn
+ values:
+ - virtual-kubelet
+ - key: kubernetes.io/role
+ operator: NotIn
+ values:
+ - master
+ - key: beta.kubernetes.io/arch
+ operator: In
+ values:
+ - amd64
+ ## Configure resource requests and limits
+ ## ref: http://kubernetes.io/docs/user-guide/compute-resources/
+ ##
+ resources:
+ daemonsetlinux:
+ requests:
+ cpu: 75m
+ memory: 225Mi
+ limits:
+ cpu: 150m
+ memory: 600Mi
+ daemonsetwindows:
+ limits:
+ cpu: 200m
+ memory: 600Mi
+ deployment:
+ requests:
+ cpu: 150m
+ memory: 250Mi
+ limits:
+ cpu: 1
+ memory: 1Gi
diff --git a/charts/azuremonitor-containers/README.md b/charts/azuremonitor-containers/README.md
index 469fac94a..a3f17b509 100644
--- a/charts/azuremonitor-containers/README.md
+++ b/charts/azuremonitor-containers/README.md
@@ -93,6 +93,7 @@ The following table lists the configurable parameters of the MSOMS chart and the
| `omsagent.env.clusterName` | Name of your cluster | Does not have a default value, needs to be provided |
| `omsagent.rbac` | rbac enabled/disabled | true (i.e.enabled) |
| `omsagent.proxy` | Proxy endpoint | Doesnt have default value. Refer to [configure proxy](#Configuring-Proxy-Endpoint) |
+| `omsagent.priority` | DaemonSet Pod Priority | This is the [priority](https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/) to use for the daemonsets such that they get scheduled onto the node ahead of "normal" pods - must be an integer, defaults to 10 |
> Note: For Azure Manage K8s clusters such as Azure Arc K8s and ARO v4, `omsagent.env.clusterId` with fully qualified azure resource id of the cluster should be used instead of `omsagent.env.clusterName`
@@ -100,6 +101,7 @@ The following table lists the configurable parameters of the MSOMS chart and the
- Parameter `omsagent.env.doNotCollectKubeSystemLogs` has been removed starting chart version 1.0.0. Refer to 'Agent data collection settings' section below to configure it using configmap.
- onboarding of multiple clusters with the same cluster name to same log analytics workspace not supported. If need this configuration, use the cluster FQDN name rather than cluster dns prefix to avoid collision with clusterName
+- The `omsagent.priority` parameter sets the priority of the omsagent daemonset priority class. This pod priority class is used for daemonsets to allow them to have priority over pods that can be scheduled elsewhere. Without a priority class, it is possible for a node to fill up with "normal" pods before the daemonset pods get to be created for the node or get scheduled. Note that pods are not "daemonset" pods - they are just pods created by the daemonset controller but they have a specific affinity set during creation to the specific node each pod was created to run on. You want this value to be greater than 0 (default is 10) and generally greater than pods that have the flexibility to run on different nodes such that they do not block the node specific pods.
## Agent data collection settings
diff --git a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml
index ebdd5ea3f..b7482b8b5 100644
--- a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml
@@ -1,4 +1,18 @@
{{- if or ( contains "microsoft.kubernetes/connectedclusters" (.Values.Azure.Cluster.ResourceId | lower) ) ( contains "microsoft.kubernetes/connectedclusters" (.Values.omsagent.env.clusterId | lower)) }}
+#extension model
+{{- if not (empty .Values.Azure.Extension.Name) }}
+apiVersion: clusterconfig.azure.com/v1beta1
+kind: AzureExtensionIdentity
+metadata:
+ name: {{ .Values.Azure.Extension.Name }}
+ namespace: azure-arc
+spec:
+ serviceAccounts:
+ - name: omsagent
+ namespace: kube-system
+ tokenNamespace: azure-arc
+---
+{{- end }}
apiVersion: clusterconfig.azure.com/v1beta1
kind: AzureClusterIdentityRequest
metadata:
@@ -6,4 +20,7 @@ metadata:
namespace: azure-arc
spec:
audience: https://monitoring.azure.com/
+ {{- if not (empty .Values.Azure.Extension.Name) }}
+ resourceId: {{ .Values.Azure.Extension.Name }}
+ {{- end }}
{{- end }}
diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
index 81003c704..82d210f3d 100644
--- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
@@ -27,10 +27,11 @@ spec:
checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }}
checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }}
spec:
- dnsConfig:
+ priorityClassName: omsagent
+ dnsConfig:
options:
- name: ndots
- value: "3"
+ value: "3"
{{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }}
nodeSelector:
kubernetes.io/os: windows
diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
index 3d29ede42..0272c6263 100644
--- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
@@ -28,10 +28,11 @@ spec:
checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }}
checksum/logsettings: {{ toYaml .Values.omsagent.logsettings | sha256sum }}
spec:
- dnsConfig:
+ priorityClassName: omsagent
+ dnsConfig:
options:
- name: ndots
- value: "3"
+ value: "3"
{{- if .Values.omsagent.rbac }}
serviceAccountName: omsagent
{{- end }}
@@ -70,6 +71,10 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
+ {{- if not (empty .Values.Azure.Extension.Name) }}
+ - name: ARC_K8S_EXTENSION_NAME
+ value: {{ .Values.Azure.Extension.Name | quote }}
+ {{- end }}
- name: USER_ASSIGNED_IDENTITY_CLIENT_ID
value: ""
{{- if .Values.omsagent.logsettings.logflushintervalsecs }}
diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
index 8609d25c9..eb3a06f9d 100644
--- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
@@ -35,9 +35,9 @@ spec:
containers:
- name: omsagent
{{- if eq (.Values.omsagent.domain | lower) "opinsights.azure.cn" }}
- image: "mcr.azk8s.cn/azuremonitor/containerinsights/ciprod:{{ .Values.omsagent.image.tag }}"
+ image: "mcr.azk8s.cn/azuremonitor/containerinsights/ciprod:{{ .Values.omsagent.rsimage.tag }}"
{{- else }}
- image: {{ printf "%s:%s" .Values.omsagent.image.repo .Values.omsagent.image.tag }}
+ image: {{ printf "%s:%s" .Values.omsagent.rsimage.repo .Values.omsagent.rsimage.tag }}
{{- end }}
imagePullPolicy: IfNotPresent
resources:
@@ -67,8 +67,12 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
+ {{- if not (empty .Values.Azure.Extension.Name) }}
+ - name: ARC_K8S_EXTENSION_NAME
+ value: {{ .Values.Azure.Extension.Name | quote }}
+ {{- end }}
- name: USER_ASSIGNED_IDENTITY_CLIENT_ID
- value: ""
+ value: ""
securityContext:
privileged: true
ports:
diff --git a/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml
new file mode 100644
index 000000000..4d9980ab3
--- /dev/null
+++ b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml
@@ -0,0 +1,22 @@
+{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}}
+# This pod priority class is used for daemonsets to allow them to have priority
+# over pods that can be scheduled elsewhere. Without a priority class, it is
+# possible for a node to fill up with pods before the daemonset pods get to be
+# created for the node or get scheduled. Note that pods are not "daemonset"
+# pods - they are just pods created by the daemonset controller but they have
+# a specific affinity set during creation to the specific node each pod was
+# created to run on (daemonset controller takes care of that)
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+ name: omsagent
+ # Priority classes don't have labels :-)
+ annotations:
+ chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ component: oms-agent
+value: {{ .Values.omsagent.priority }}
+globalDefault: false
+description: "This is the daemonset priority class for omsagent"
+{{- end }}
diff --git a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml
index bd4e9baf3..5db5c2dab 100644
--- a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml
@@ -33,10 +33,14 @@ rules:
verbs: ["get", "create", "patch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
+#arc k8s extension model grants access as part of the extension msi
+#remove this explicit permission once the extension available in public preview
+{{- if (empty .Values.Azure.Extension.Name) }}
- apiGroups: [""]
resources: ["secrets"]
resourceNames: ["container-insights-clusteridentityrequest-token"]
verbs: ["get"]
+{{- end }}
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1beta1
diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml
index debd66b0b..d19dc0c47 100644
--- a/charts/azuremonitor-containers/values.yaml
+++ b/charts/azuremonitor-containers/values.yaml
@@ -4,11 +4,14 @@
## Microsoft OMS Agent image for kubernetes cluster monitoring
## ref: https://github.com/microsoft/Docker-Provider/tree/ci_prod
-## Values of ResourceId and Region under Azure->Cluster being populated by Azure Arc K8s RP during the installation of the extension
+## Values of under Azure are being populated by Azure Arc K8s RP during the installation of the extension
Azure:
Cluster:
Region:
ResourceId:
+ Extension:
+ Name: ""
+ ResourceId: ""
omsagent:
image:
repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod"
@@ -17,6 +20,26 @@ omsagent:
pullPolicy: IfNotPresent
dockerProviderVersion: "12.0.0-0"
agentVersion: "1.10.0.1"
+ rsimage:
+ repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod"
+ tag: "fluentd4a"
+ pullPolicy: IfNotPresent
+ dockerProviderVersion: "12.0.0-0"
+ agentVersion: "1.10.0.1"
+
+ # The priority used by the omsagent priority class for the daemonset pods
+ # Note that this is not execution priority - it is scheduling priority, as
+ # in getting scheduled to the node. This needs to be greater than 0 such
+ # that the daemonset pods, which can not schedule onto different nodes as
+ # they are defined to run on specific nodes, are not accidentally frozen
+ # out of a node due to other pods showing up earlier in scheduling.
+ # (DaemonSet pods by definition only are created once the node exists for
+ # them to be created for and thus it is possible to have "normal" pods
+ # already in line to run on the node before the DaemonSet controller got a
+ # chance to build pod for the node and give it to the scheduler)
+ # Should be some number greater than default (0)
+ priority: 10
+
## To get your workspace id and key do the following
## You can create a Azure Loganalytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux.
@@ -58,7 +81,7 @@ omsagent:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- labelSelector:
- matchExpressions:
+ matchExpressions:
- key: kubernetes.io/os
operator: In
values:
@@ -67,10 +90,10 @@ omsagent:
operator: NotIn
values:
- virtual-kubelet
- - key: kubernetes.io/arch
+ - key: kubernetes.io/arch
operator: In
values:
- - amd64
+ - amd64
nodeSelectorTerms:
- labelSelector:
matchExpressions:
@@ -82,10 +105,10 @@ omsagent:
operator: NotIn
values:
- virtual-kubelet
- - key: beta.kubernetes.io/arch
+ - key: beta.kubernetes.io/arch
operator: In
values:
- - amd64
+ - amd64
deployment:
affinity:
nodeAffinity:
@@ -114,10 +137,10 @@ omsagent:
operator: NotIn
values:
- master
- - key: kubernetes.io/arch
+ - key: kubernetes.io/arch
operator: In
values:
- - amd64
+ - amd64
nodeSelectorTerms:
- labelSelector:
matchExpressions:
@@ -133,10 +156,10 @@ omsagent:
operator: NotIn
values:
- master
- - key: beta.kubernetes.io/arch
+ - key: beta.kubernetes.io/arch
operator: In
values:
- - amd64
+ - amd64
## Configure resource requests and limits
## ref: http://kubernetes.io/docs/user-guide/compute-resources/
##
diff --git a/charts/index.yaml b/charts/index.yaml
new file mode 100644
index 000000000..5add55f5b
--- /dev/null
+++ b/charts/index.yaml
@@ -0,0 +1,91 @@
+apiVersion: v1
+entries:
+ azuremonitor-containers:
+ - apiVersion: v1
+ appVersion: 7.0.0-1
+ created: "2021-02-24T16:01:29.6943046-08:00"
+ description: Helm chart for deploying Azure Monitor container monitoring agent
+ in Kubernetes
+ digest: b1d6c8558ddeeccca71954eee2adddef832f68b6cff07cafe6684a05c978241f
+ home: https://docs.microsoft.com/en-us/azure/monitoring/monitoring-container-health
+ icon: https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/img/azuremonitor-containers.svg
+ keywords:
+ - monitoring
+ - azuremonitor
+ - azure
+ - oms
+ - containerinsights
+ - metric
+ - event
+ - logs
+ - containerhealth
+ - kubernetesmonitoring
+ - acs-engine
+ - aks-engine
+ - azurestack
+ - openshift v4
+ - azure redhat openshift v4
+ - on-prem kubernetes monitoring
+ - containerlogs
+ - containerhealth
+ - containermonitoring
+ - hybrid kubernetes monitoring
+ - kubernetes
+ - kuberneteshealth
+ kubeVersion: ^1.10.0-0
+ maintainers:
+ - email: visnara@microsoft.com
+ name: vishiy
+ - email: gangams@microsoft.com
+ name: ganga1980
+ name: azuremonitor-containers
+ sources:
+ - https://github.com/microsoft/Docker-Provider/tree/ci_prod
+ urls:
+ - https://raw.githubusercontent.com/microsoft/Docker-Provider/grwehner/oneagent/charts/azuremonitor-containers-2.8.0.tgz
+ version: 2.8.0
+ azuremonitor-containers-highscale-private-preview:
+ - apiVersion: v1
+ appVersion: 7.0.0-1
+ created: "2021-02-24T16:01:29.7192586-08:00"
+ description: Helm chart for deploying Azure Monitor container monitoring agent
+ in Kubernetes
+ digest: 6f216a4d5280aed01650eea6979d4cfd143f55eb24171e6059a69316484d5ad2
+ home: https://docs.microsoft.com/en-us/azure/monitoring/monitoring-container-health
+ icon: https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/img/azuremonitor-containers.svg
+ keywords:
+ - monitoring
+ - azuremonitor
+ - azure
+ - oms
+ - containerinsights
+ - metric
+ - event
+ - logs
+ - containerhealth
+ - kubernetesmonitoring
+ - acs-engine
+ - aks-engine
+ - azurestack
+ - openshift v4
+ - azure redhat openshift v4
+ - on-prem kubernetes monitoring
+ - containerlogs
+ - containerhealth
+ - containermonitoring
+ - hybrid kubernetes monitoring
+ - kubernetes
+ - kuberneteshealth
+ kubeVersion: ^1.10.0-0
+ maintainers:
+ - email: visnara@microsoft.com
+ name: vishiy
+ - email: gangams@microsoft.com
+ name: ganga1980
+ name: azuremonitor-containers-highscale-private-preview
+ sources:
+ - https://github.com/microsoft/Docker-Provider/tree/ci_prod
+ urls:
+ - https://raw.githubusercontent.com/microsoft/Docker-Provider/grwehner/oneagent/charts/azuremonitor-containers-highscale-private-preview-2.8.0.tgz
+ version: 2.8.0
+generated: "2021-02-24T16:01:29.6683989-08:00"
diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh
index c4067f25e..d51392d82 100644
--- a/kubernetes/linux/main.sh
+++ b/kubernetes/linux/main.sh
@@ -451,6 +451,7 @@ service cron start
#check if agent onboarded successfully
/opt/microsoft/omsagent/bin/omsadmin.sh -l
+#/opt/microsoft/omsagent/bin/service_control stop
#get omsagent and docker-provider versions
dpkg -l | grep omsagent | awk '{print $2 " " $3}'
@@ -484,17 +485,19 @@ echo "current region: $currentregion"
isoneagentregion=false
#set isoneagentregion as true if matching region is found
-if [ ! -z $oneagentregions ] && [ ! -z $currentregion ]; then
- for rgn in $(echo $oneagentregions | sed "s/,/ /g"); do
- if [ "$rgn" == "$currentregion" ]; then
- isoneagentregion=true
- echo "current region is in oneagent regions..."
- break
- fi
- done
-else
- echo "current region is not in oneagent regions..."
-fi
+#if [ ! -z $oneagentregions ] && [ ! -z $currentregion ]; then
+# for rgn in $(echo $oneagentregions | sed "s/,/ /g"); do
+# if [ "$rgn" == "$currentregion" ]; then
+# isoneagentregion=true
+# echo "current region is in oneagent regions..."
+# break
+# fi
+# done
+#else
+# echo "current region is not in oneagent regions..."
+#fi
+
+isoneagentregion=true
if [ "$isoneagentregion" = true ]; then
#if configmap has a routing for logs, but current region is in the oneagent region list, take the configmap route
@@ -511,7 +514,7 @@ fi
#start oneagent
-if [ ! -e "/etc/config/kube.conf" ]; then
+#if [ ! -e "/etc/config/kube.conf" ]; then
if [ ! -z $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE ]; then
echo "container logs configmap route is $AZMON_CONTAINER_LOGS_ROUTE"
echo "container logs effective route is $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE"
@@ -544,12 +547,12 @@ if [ ! -e "/etc/config/kube.conf" ]; then
dpkg -l | grep mdsd | awk '{print $2 " " $3}'
echo "starting mdsd ..."
- mdsd -l -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos &
+ mdsd -l -a -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos &
touch /opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2
fi
fi
-fi
+#fi
echo "************end oneagent log routing checks************"
#telegraf & fluentbit requirements
@@ -616,7 +619,7 @@ echo "export HOST_VAR=/hostfs/var" >> ~/.bashrc
#start telegraf
-/opt/telegraf --config $telegrafConfFile &
+HOST_PROC=/proc /opt/telegraf --config $telegrafConfFile &
/opt/telegraf --version
dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}'
@@ -630,9 +633,16 @@ service rsyslog stop
echo "getting rsyslog status..."
service rsyslog status
+
+
+# start the new fluentd
+fluentd -c /opt/fluent/fluentd4.conf -o /var/fluent/fluentd4.log &
+
+
+
shutdown() {
/opt/microsoft/omsagent/bin/service_control stop
- }
+ }
trap "shutdown" SIGTERM
diff --git a/kubernetes/linux/mdsd.xml b/kubernetes/linux/mdsd.xml
index 76d2104fc..e667ea8c3 100644
--- a/kubernetes/linux/mdsd.xml
+++ b/kubernetes/linux/mdsd.xml
@@ -47,33 +47,55 @@
Each column has a name, an augmented JSON source type, and a target MDS type.
-->
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
-
-
-
-
+
-
+
+
+
+ ]]>
+
+
+
+
-
-
- ]]>
+ ]]>
-
-
+
+
\ No newline at end of file
diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh
index fe6c0565a..8fd6eead4 100644
--- a/kubernetes/linux/setup.sh
+++ b/kubernetes/linux/setup.sh
@@ -32,7 +32,7 @@ mv $TMPDIR/omsbundle* $TMPDIR/omsbundle
#/usr/bin/dpkg -i $TMPDIR/omsbundle/100/omsconfig*.deb
#install oneagent - Official bits (10/18)
-wget https://github.com/microsoft/Docker-Provider/releases/download/10182020-oneagent/azure-mdsd_1.5.126-build.master.99_x86_64.deb
+wget https://github.com/microsoft/Docker-Provider/releases/download/1.6.0.168-oneagent/azure-mdsd_1.6.0-build.master.168_x86_64.deb
/usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb
cp -f $TMPDIR/mdsd.xml /etc/mdsd.d
cp -f $TMPDIR/envmdsd /etc/mdsd.d
@@ -79,3 +79,18 @@ rm -f $TMPDIR/docker-cimprov*.sh
rm -f $TMPDIR/azure-mdsd*.deb
rm -f $TMPDIR/mdsd.xml
rm -f $TMPDIR/envmdsd
+
+
+
+# install fluentd
+apt-get install nano less curl sudo apt-utils rubygems ruby-dev gcc make wget ucf locate -y
+gem install fluentd --no-doc
+fluentd --setup ./fluent
+
+gem install gyoku iso8601 --no-doc
+
+# copy all plugins to the new fluentd
+# rmdir fluent/plugin
+# ln -s /opt/microsoft/omsagent/plugin/ /opt/fluent
+# mkdir /etc/fluent
+# ln -s /opt/microsoft/omsagent/plugin/ /etc/fluent/.
diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml
index 67bd9cdde..da4746dba 100644
--- a/kubernetes/omsagent.yaml
+++ b/kubernetes/omsagent.yaml
@@ -59,14 +59,6 @@ data:
chunk_size_limit 4m
- #Kubernetes pod inventory
-
- type kubepodinventory
- tag oms.containerinsights.KubePodInventory
- run_interval 60
- log_level debug
-
-
#Kubernetes Persistent Volume inventory
type kubepvinventory
@@ -140,21 +132,6 @@ data:
type filter_health_model_builder
-
- type out_oms
- log_level debug
- num_threads 2
- buffer_chunk_limit 4m
- buffer_type file
- buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer
- buffer_queue_limit 20
- buffer_queue_full_action drop_oldest_chunk
- flush_interval 20s
- retry_limit 10
- retry_wait 5s
- max_retry_wait 5m
-
-
type out_oms
log_level debug
@@ -244,7 +221,7 @@ data:
max_retry_wait 5m
-
+
type out_mdm
log_level debug
num_threads 5
diff --git a/scripts/preview/mdm-alerts/private-preview-onboard.sh b/scripts/preview/mdm-alerts/private-preview-onboard.sh
index 6515c6753..9f5bc4ca4 100644
--- a/scripts/preview/mdm-alerts/private-preview-onboard.sh
+++ b/scripts/preview/mdm-alerts/private-preview-onboard.sh
@@ -86,7 +86,7 @@ helm repo update
echo "uninstalling existing release if any for azmon-containers-ci-mdm-alert-release"
helm uninstall azmon-containers-ci-mdm-alert-release
-helm upgrade --install azmon-containers-ci-mdm-alert-release --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion azmon-preview-mdm-alert/azuremonitor-containers --kube-context $clusterName
+helm upgrade --install azmon-containers-ci-mdm-alert-release --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion,omsagent.logsettings.tailbufmaxsizemegabytes="20" azmon-preview-mdm-alert/azuremonitor-containers --kube-context $clusterName
echo "chart installation completed."
echo "setting the subscription id of the cluster: ${clusterSubscriptionId}"
diff --git a/source/plugins/ruby-fluentd4/ApplicationInsightsUtility.rb b/source/plugins/ruby-fluentd4/ApplicationInsightsUtility.rb
new file mode 100644
index 000000000..b118cc646
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/ApplicationInsightsUtility.rb
@@ -0,0 +1,307 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+class ApplicationInsightsUtility
+ require_relative "lib/application_insights"
+ require_relative "omslog"
+ require_relative "DockerApiClient"
+ require_relative "oms_common"
+ require_relative "proxy_utils"
+ require "yajl/json_gem"
+ require "base64"
+
+ @@HeartBeat = "HeartBeatEvent"
+ @@Exception = "ExceptionEvent"
+ @@AcsClusterType = "ACS"
+ @@AksClusterType = "AKS"
+ @OmsAdminFilePath = "/etc/opt/microsoft/omsagent/conf/omsadmin.conf"
+ @@EnvAcsResourceName = "ACS_RESOURCE_NAME"
+ @@EnvAksRegion = "AKS_REGION"
+ @@EnvAgentVersion = "AGENT_VERSION"
+ @@EnvApplicationInsightsKey = "APPLICATIONINSIGHTS_AUTH"
+ @@EnvApplicationInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT"
+ @@EnvControllerType = "CONTROLLER_TYPE"
+ @@EnvContainerRuntime = "CONTAINER_RUNTIME"
+
+ @@CustomProperties = {}
+ @@Tc = nil
+ @@hostName = (OMS::Common.get_hostname)
+ @@proxy = (ProxyUtils.getProxyConfiguration)
+
+ def initialize
+ end
+
+ class << self
+ #Set default properties for telemetry event
+ def initializeUtility()
+ begin
+ resourceInfo = ENV["AKS_RESOURCE_ID"]
+ if resourceInfo.nil? || resourceInfo.empty?
+ @@CustomProperties["ACSResourceName"] = ENV[@@EnvAcsResourceName]
+ @@CustomProperties["ClusterType"] = @@AcsClusterType
+ @@CustomProperties["SubscriptionID"] = ""
+ @@CustomProperties["ResourceGroupName"] = ""
+ @@CustomProperties["ClusterName"] = ""
+ @@CustomProperties["Region"] = ""
+ else
+ @@CustomProperties["AKS_RESOURCE_ID"] = resourceInfo
+ begin
+ splitStrings = resourceInfo.split("/")
+ subscriptionId = splitStrings[2]
+ resourceGroupName = splitStrings[4]
+ clusterName = splitStrings[8]
+ rescue => errorStr
+ $log.warn("Exception in AppInsightsUtility: parsing AKS resourceId: #{resourceInfo}, error: #{errorStr}")
+ end
+ @@CustomProperties["ClusterType"] = @@AksClusterType
+ @@CustomProperties["SubscriptionID"] = subscriptionId
+ @@CustomProperties["ResourceGroupName"] = resourceGroupName
+ @@CustomProperties["ClusterName"] = clusterName
+ @@CustomProperties["Region"] = ENV[@@EnvAksRegion]
+ end
+
+      #Commenting it out for now in the initialize method; we need to pivot all telemetry off of the kubenode docker version
+ #getDockerInfo()
+ @@CustomProperties["WorkspaceID"] = getWorkspaceId
+ @@CustomProperties["AgentVersion"] = ENV[@@EnvAgentVersion]
+ @@CustomProperties["ControllerType"] = ENV[@@EnvControllerType]
+ @@CustomProperties["Computer"] = @@hostName
+ encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey]
+ appInsightsEndpoint = ENV[@@EnvApplicationInsightsEndpoint]
+ @@CustomProperties["WorkspaceCloud"] = getWorkspaceCloud
+ if !@@proxy.nil? && !@@proxy.empty?
+ $log.info("proxy configured")
+ @@CustomProperties["IsProxyConfigured"] = "true"
+ isProxyConfigured = true
+ else
+ @@CustomProperties["IsProxyConfigured"] = "false"
+ isProxyConfigured = false
+ $log.info("proxy is not configured")
+ end
+
+ #Check if telemetry is turned off
+ telemetryOffSwitch = ENV["DISABLE_TELEMETRY"]
+ if telemetryOffSwitch && !telemetryOffSwitch.nil? && !telemetryOffSwitch.empty? && telemetryOffSwitch.downcase == "true".downcase
+ $log.warn("AppInsightsUtility: Telemetry is disabled")
+ @@Tc = ApplicationInsights::TelemetryClient.new
+ elsif !encodedAppInsightsKey.nil?
+ decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey)
+
+        #override the AI endpoint if it's available, otherwise use the default.
+ if appInsightsEndpoint && !appInsightsEndpoint.nil? && !appInsightsEndpoint.empty?
+ $log.info("AppInsightsUtility: Telemetry client uses overrided endpoint url : #{appInsightsEndpoint}")
+ #telemetrySynchronousSender = ApplicationInsights::Channel::SynchronousSender.new appInsightsEndpoint
+ #telemetrySynchronousQueue = ApplicationInsights::Channel::SynchronousQueue.new(telemetrySynchronousSender)
+ #telemetryChannel = ApplicationInsights::Channel::TelemetryChannel.new nil, telemetrySynchronousQueue
+ if !isProxyConfigured
+ sender = ApplicationInsights::Channel::AsynchronousSender.new appInsightsEndpoint
+ else
+ $log.info("AppInsightsUtility: Telemetry client uses provided proxy configuration since proxy configured")
+ sender = ApplicationInsights::Channel::AsynchronousSender.new appInsightsEndpoint, @@proxy
+ end
+ queue = ApplicationInsights::Channel::AsynchronousQueue.new sender
+ channel = ApplicationInsights::Channel::TelemetryChannel.new nil, queue
+ @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey, channel
+ else
+ if !isProxyConfigured
+ sender = ApplicationInsights::Channel::AsynchronousSender.new
+ else
+ $log.info("AppInsightsUtility: Telemetry client uses provided proxy configuration since proxy configured")
+ sender = ApplicationInsights::Channel::AsynchronousSender.new nil, @@proxy
+ end
+ queue = ApplicationInsights::Channel::AsynchronousQueue.new sender
+ channel = ApplicationInsights::Channel::TelemetryChannel.new nil, queue
+ @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey, channel
+ end
+ # The below are default recommended values. If you change these, ensure you test telemetry flow fully
+
+ # flush telemetry if we have 10 or more telemetry items in our queue
+ #@@Tc.channel.queue.max_queue_length = 10
+
+ # send telemetry to the service in batches of 5
+ #@@Tc.channel.sender.send_buffer_size = 5
+
+ # the background worker thread will be active for 5 seconds before it shuts down. if
+ # during this time items are picked up from the queue, the timer is reset.
+ #@@Tc.channel.sender.send_time = 5
+
+ # the background worker thread will poll the queue every 0.5 seconds for new items
+ #@@Tc.channel.sender.send_interval = 0.5
+ end
+ rescue => errorStr
+ $log.warn("Exception in AppInsightsUtility: initilizeUtility - error: #{errorStr}")
+ end
+ end
+
+ def getContainerRuntimeInfo()
+ containerRuntime = ENV[@@EnvContainerRuntime]
+ if !containerRuntime.nil? && !containerRuntime.empty?
+        # DockerVersion field holds either the containerRuntime for non-docker, or the Docker version if it's docker
+ @@CustomProperties["DockerVersion"] = containerRuntime
+ if containerRuntime.casecmp("docker") == 0
+ dockerInfo = DockerApiClient.dockerInfo
+ if (!dockerInfo.nil? && !dockerInfo.empty?)
+ @@CustomProperties["DockerVersion"] = dockerInfo["Version"]
+ end
+ end
+ end
+ end
+
+ def sendHeartBeatEvent(pluginName)
+ begin
+ eventName = pluginName + @@HeartBeat
+ if !(@@Tc.nil?)
+ @@Tc.track_event eventName, :properties => @@CustomProperties
+ $log.info("AppInsights Heartbeat Telemetry put successfully into the queue")
+ end
+ rescue => errorStr
+ $log.warn("Exception in AppInsightsUtility: sendHeartBeatEvent - error: #{errorStr}")
+ end
+ end
+
+ def sendLastProcessedContainerInventoryCountMetric(pluginName, properties)
+ begin
+ if !(@@Tc.nil?)
+ @@Tc.track_metric "LastProcessedContainerInventoryCount", properties["ContainerCount"],
+ :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT,
+ :properties => @@CustomProperties
+ $log.info("AppInsights Container Count Telemetry sput successfully into the queue")
+ end
+ rescue => errorStr
+ $log.warn("Exception in AppInsightsUtility: sendCustomMetric - error: #{errorStr}")
+ end
+ end
+
+ def sendCustomEvent(eventName, properties)
+ begin
+ if @@CustomProperties.empty? || @@CustomProperties.nil?
+ initializeUtility()
+ end
+ telemetryProps = {}
+ # add common dimensions
+ @@CustomProperties.each { |k, v| telemetryProps[k] = v }
+ # add passed-in dimensions if any
+ if (!properties.nil? && !properties.empty?)
+ properties.each { |k, v| telemetryProps[k] = v }
+ end
+ if !(@@Tc.nil?)
+ @@Tc.track_event eventName, :properties => telemetryProps
+ $log.info("AppInsights Custom Event #{eventName} sent successfully")
+ end
+ rescue => errorStr
+ $log.warn("Exception in AppInsightsUtility: sendCustomEvent - error: #{errorStr}")
+ end
+ end
+
+ def sendExceptionTelemetry(errorStr, properties = nil)
+ begin
+ if @@CustomProperties.empty? || @@CustomProperties.nil?
+ initializeUtility()
+ elsif @@CustomProperties["DockerVersion"].nil?
+ getContainerRuntimeInfo()
+ end
+ telemetryProps = {}
+ # add common dimensions
+ @@CustomProperties.each { |k, v| telemetryProps[k] = v }
+ # add passed-in dimensions if any
+ if (!properties.nil? && !properties.empty?)
+ properties.each { |k, v| telemetryProps[k] = v }
+ end
+ if !(@@Tc.nil?)
+ @@Tc.track_exception errorStr, :properties => telemetryProps
+ $log.info("AppInsights Exception Telemetry put successfully into the queue")
+ end
+ rescue => errorStr
+ $log.warn("Exception in AppInsightsUtility: sendExceptionTelemetry - error: #{errorStr}")
+ end
+ end
+
+ #Method to send heartbeat and container inventory count
+ def sendTelemetry(pluginName, properties)
+ begin
+ if @@CustomProperties.empty? || @@CustomProperties.nil?
+ initializeUtility()
+ elsif @@CustomProperties["DockerVersion"].nil?
+ getContainerRuntimeInfo()
+ end
+ @@CustomProperties["Computer"] = properties["Computer"]
+ sendHeartBeatEvent(pluginName)
+ sendLastProcessedContainerInventoryCountMetric(pluginName, properties)
+ rescue => errorStr
+ $log.warn("Exception in AppInsightsUtility: sendTelemetry - error: #{errorStr}")
+ end
+ end
+
+ #Method to send metric. It will merge passed-in properties with common custom properties
+ def sendMetricTelemetry(metricName, metricValue, properties)
+ begin
+ if (metricName.empty? || metricName.nil?)
+ $log.warn("SendMetricTelemetry: metricName is missing")
+ return
+ end
+ if @@CustomProperties.empty? || @@CustomProperties.nil?
+ initializeUtility()
+ elsif @@CustomProperties["DockerVersion"].nil?
+ getContainerRuntimeInfo()
+ end
+ telemetryProps = {}
+ # add common dimensions
+ @@CustomProperties.each { |k, v| telemetryProps[k] = v }
+ # add passed-in dimensions if any
+ if (!properties.nil? && !properties.empty?)
+ properties.each { |k, v| telemetryProps[k] = v }
+ end
+ if !(@@Tc.nil?)
+ @@Tc.track_metric metricName, metricValue,
+ :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT,
+ :properties => telemetryProps
+ $log.info("AppInsights metric Telemetry #{metricName} put successfully into the queue")
+ end
+ rescue => errorStr
+ $log.warn("Exception in AppInsightsUtility: sendMetricTelemetry - error: #{errorStr}")
+ end
+ end
+
+ def getWorkspaceId()
+ begin
+ adminConf = {}
+ confFile = File.open(@OmsAdminFilePath, "r")
+ confFile.each_line do |line|
+ splitStrings = line.split("=")
+ adminConf[splitStrings[0]] = splitStrings[1]
+ end
+ workspaceId = adminConf["WORKSPACE_ID"]
+ return workspaceId
+ rescue => errorStr
+ $log.warn("Exception in AppInsightsUtility: getWorkspaceId - error: #{errorStr}")
+ end
+ end
+
+ def getWorkspaceCloud()
+ begin
+ adminConf = {}
+ confFile = File.open(@OmsAdminFilePath, "r")
+ confFile.each_line do |line|
+ splitStrings = line.split("=")
+ adminConf[splitStrings[0]] = splitStrings[1]
+ end
+ workspaceDomain = adminConf["URL_TLD"].strip
+ workspaceCloud = "AzureCloud"
+ if workspaceDomain.casecmp("opinsights.azure.com") == 0
+ workspaceCloud = "AzureCloud"
+ elsif workspaceDomain.casecmp("opinsights.azure.cn") == 0
+ workspaceCloud = "AzureChinaCloud"
+ elsif workspaceDomain.casecmp("opinsights.azure.us") == 0
+ workspaceCloud = "AzureUSGovernment"
+ elsif workspaceDomain.casecmp("opinsights.azure.de") == 0
+ workspaceCloud = "AzureGermanCloud"
+ else
+ workspaceCloud = "Unknown"
+ end
+ return workspaceCloud
+ rescue => errorStr
+ $log.warn("Exception in AppInsightsUtility: getWorkspaceCloud - error: #{errorStr}")
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/CustomMetricsUtils.rb b/source/plugins/ruby-fluentd4/CustomMetricsUtils.rb
new file mode 100644
index 000000000..220313e6b
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/CustomMetricsUtils.rb
@@ -0,0 +1,20 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+class CustomMetricsUtils
+ def initialize
+ end
+
+ class << self
+ def check_custom_metrics_availability
+ aks_region = ENV['AKS_REGION']
+ aks_resource_id = ENV['AKS_RESOURCE_ID']
+ aks_cloud_environment = ENV['CLOUD_ENVIRONMENT']
+ if aks_region.to_s.empty? || aks_resource_id.to_s.empty?
+ return false # This will also take care of AKS-Engine Scenario. AKS_REGION/AKS_RESOURCE_ID is not set for AKS-Engine. Only ACS_RESOURCE_NAME is set
+ end
+
+ return aks_cloud_environment.to_s.downcase == 'public'
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/plugins/ruby-fluentd4/DockerApiClient.rb b/source/plugins/ruby-fluentd4/DockerApiClient.rb
new file mode 100644
index 000000000..53dd1f39f
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/DockerApiClient.rb
@@ -0,0 +1,208 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+class DockerApiClient
+ require "socket"
+ require "yajl/json_gem"
+ require "timeout"
+ require_relative "omslog"
+ require_relative "DockerApiRestHelper"
+ require_relative "ApplicationInsightsUtility"
+
+ @@SocketPath = "/var/run/host/docker.sock"
+ @@ChunkSize = 4096
+ @@TimeoutInSeconds = 5
+ @@PluginName = "ContainerInventory"
+
+ def initialize
+ end
+
+ class << self
+ # Make docker socket call for requests
+ def getResponse(request, isMultiJson, isVersion)
+ begin
+ socket = UNIXSocket.new(@@SocketPath)
+ dockerResponse = ""
+ isTimeOut = false
+ socket.write(request)
+ # iterate through the response until the last chunk is less than the chunk size so that we can read all data in socket.
+ loop do
+ begin
+ responseChunk = ""
+ Timeout.timeout(@@TimeoutInSeconds) do
+ responseChunk = socket.recv(@@ChunkSize)
+ end
+ dockerResponse += responseChunk
+ rescue Timeout::Error
+ $log.warn("Socket read timedout for request: #{request} @ #{Time.now.utc.iso8601}")
+ isTimeOut = true
+ break
+ end
+ break if (isVersion) ? (responseChunk.length < @@ChunkSize) : (responseChunk.end_with? "0\r\n\r\n")
+ end
+ return (isTimeOut) ? nil : parseResponse(dockerResponse, isMultiJson)
+ rescue => errorStr
+ $log.warn("Socket call failed for request: #{request} error: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}")
+ # Adding this check to avoid an infinite loop for the docker info call in exception telemetry
+ if !request.include? "GET /version "
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return nil
+ ensure
+ if !socket.nil?
+ socket.close
+ end
+ end
+ end
+
+ def parseResponse(dockerResponse, isMultiJson)
+ # Doing this because the response is in the raw format and includes headers.
+ # Need to do a regex match to extract the json part of the response - Anything between [{}] in response
+ parsedJsonResponse = nil
+ begin
+ jsonResponse = isMultiJson ? dockerResponse[/\[{.+}\]/] : dockerResponse[/{.+}/]
+ rescue => errorStr
+ $log.warn("Regex match for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}")
+ end
+ begin
+ if jsonResponse != nil
+ parsedJsonResponse = JSON.parse(jsonResponse)
+ end
+ rescue => errorStr
+ $log.warn("Json parsing for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}")
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return parsedJsonResponse
+ end
+
+ def getDockerHostName()
+ dockerHostName = ""
+ request = DockerApiRestHelper.restDockerInfo
+ response = getResponse(request, false, false)
+ if (response != nil)
+ dockerHostName = response["Name"]
+ end
+ return dockerHostName
+ end
+
+ def listContainers()
+ ids = []
+ begin
+ request = DockerApiRestHelper.restDockerPs
+ containers = getResponse(request, true, false)
+ if !containers.nil? && !containers.empty?
+ containers.each do |container|
+ labels = (!container["Labels"].nil?) ? container["Labels"] : container["labels"]
+ if !labels.nil?
+ labelKeys = labels.keys
+ dockerTypeLabel = labelKeys.find { |k| "io.kubernetes.docker.type".downcase == k.downcase }
+ if !dockerTypeLabel.nil?
+ dockerTypeLabelValue = labels[dockerTypeLabel]
+ # Checking for 'io.kubernetes.docker.type' label for docker containers to exclude the pause-amd64 containers
+ if !(dockerTypeLabelValue.downcase == "podsandbox".downcase)
+ # Case insensitive lookup for pod uid label - This is to exclude containers created using docker run and only include containers that
+ # are created in the pods for ContainerInventory
+ keyValue = labelKeys.find { |k| "io.kubernetes.pod.uid".downcase == k.downcase }
+ if !labels[keyValue].nil?
+ ids.push(container["Id"])
+ end
+ end
+ end
+ end
+ end
+ end
+ rescue => errorStr
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return ids
+ end
+
+ # This method splits the tag value into an array - repository, image, tag, repodigest-imageid
+ def getImageRepositoryImageTag(tagValue, digestValue)
+ result = ["", "", "", ""]
+ atLocation = nil
+ begin
+ if !digestValue.empty?
+ # digest is of the format - repo@sha256:imageid
+ atLocation = digestValue.index("@")
+ if !atLocation.nil?
+ result[3] = digestValue[(atLocation + 1)..-1]
+ end
+ end
+
+ if !tagValue.empty?
+ # Find delimiters in the string of format repository/image:imagetag
+ slashLocation = tagValue.index("/")
+ colonLocation = tagValue.index(":")
+ if !colonLocation.nil?
+ if slashLocation.nil?
+ # image:imagetag
+ result[1] = tagValue[0..(colonLocation - 1)]
+ else
+ # repository/image:imagetag
+ result[0] = tagValue[0..(slashLocation - 1)]
+ result[1] = tagValue[(slashLocation + 1)..(colonLocation - 1)]
+ end
+ result[2] = tagValue[(colonLocation + 1)..-1]
+ end
+ elsif !digestValue.empty?
+ # Getting repo information from repodigests when repotags is empty
+ if !atLocation.nil?
+ result[0] = digestValue[0..(atLocation - 1)]
+ end
+ end
+ rescue => errorStr
+ $log.warn("Exception at getImageRepositoryImageTag: #{errorStr} @ #{Time.now.utc.iso8601}")
+ end
+ return result
+ end
+
+ # Image is in the format repository/image:imagetag - This method creates a hash of image id and repository, image and tag
+ def getImageIdMap()
+ result = nil
+ begin
+ request = DockerApiRestHelper.restDockerImages
+ images = getResponse(request, true, false)
+ if !images.nil? && !images.empty?
+ result = {}
+ images.each do |image|
+ tagValue = ""
+ tags = image["RepoTags"]
+ if !tags.nil? && tags.kind_of?(Array) && tags.length > 0
+ tagValue = tags[0]
+ end
+ digestValue = ""
+ digests = image["RepoDigests"]
+ if !digests.nil? && digests.kind_of?(Array) && digests.length > 0
+ digestValue = digests[0]
+ end
+ idValue = image["Id"]
+ if !idValue.nil?
+ result[idValue] = getImageRepositoryImageTag(tagValue, digestValue)
+ end
+ end
+ end
+ rescue => errorStr
+ $log.warn("Exception at getImageIdMap: #{errorStr} @ #{Time.now.utc.iso8601}")
+ end
+ return result
+ end
+
+ def dockerInspectContainer(id)
+ request = DockerApiRestHelper.restDockerInspect(id)
+ return getResponse(request, false, false)
+ end
+
+ # This method returns docker version and docker api version for telemetry
+ def dockerInfo()
+ request = DockerApiRestHelper.restDockerVersion
+ response = getResponse(request, false, true)
+ dockerInfo = {}
+ if (response != nil)
+ dockerInfo["Version"] = response["Version"]
+ dockerInfo["ApiVersion"] = response["ApiVersion"]
+ end
+ return dockerInfo
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/DockerApiRestHelper.rb b/source/plugins/ruby-fluentd4/DockerApiRestHelper.rb
new file mode 100644
index 000000000..fbb08d6ce
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/DockerApiRestHelper.rb
@@ -0,0 +1,55 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+class DockerApiRestHelper
+ def initialize
+ end
+
+ class << self
+ # Create the REST request to list images
+ # https://docs.docker.com/engine/reference/api/docker_remote_api_v1.21/#list-images
+ # returns Request in string format
+ def restDockerImages()
+ begin
+ return "GET /images/json?all=0 HTTP/1.1\r\nHost: localhost\r\n\r\n"
+ end
+ end
+
+ # Create the REST request to list containers
+ # https://docs.docker.com/engine/reference/api/docker_remote_api_v1.21/#list-containers
+ # returns Request in string format
+ def restDockerPs()
+ begin
+ return "GET /containers/json?all=1 HTTP/1.1\r\nHost: localhost\r\n\r\n"
+ end
+ end
+
+ # Create the REST request to inspect a container
+ # https://docs.docker.com/engine/reference/api/docker_remote_api_v1.21/#inspect-a-container
+ # parameter - ID of the container to be inspected
+ # returns Request in string format
+ def restDockerInspect(id)
+ begin
+ return "GET /containers/" + id + "/json HTTP/1.1\r\nHost: localhost\r\n\r\n"
+ end
+ end
+
+ # Create the REST request to get docker info
+    # https://docs.docker.com/engine/reference/api/docker_remote_api_v1.21/#display-system-wide-information
+ # returns Request in string format
+ def restDockerInfo()
+ begin
+ return "GET /info HTTP/1.1\r\nHost: localhost\r\n\r\n"
+ end
+ end
+
+    # Create the REST request to get the docker version
+ # https://docs.docker.com/engine/api/v1.21/#21-containers
+ # returns Request in string format
+ def restDockerVersion()
+ begin
+ return "GET /version HTTP/1.1\r\nHost: localhost\r\n\r\n"
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/KubernetesApiClient.rb b/source/plugins/ruby-fluentd4/KubernetesApiClient.rb
new file mode 100644
index 000000000..aca2142a0
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/KubernetesApiClient.rb
@@ -0,0 +1,818 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+class KubernetesApiClient
+ require "yajl/json_gem"
+ require "logger"
+ require "net/http"
+ require "net/https"
+ require "uri"
+ require "time"
+
+ require_relative "oms_common"
+ require_relative "constants"
+
+ @@ApiVersion = "v1"
+ @@ApiVersionApps = "v1"
+ @@ApiGroupApps = "apps"
+ @@ApiGroupHPA = "autoscaling"
+ @@ApiVersionHPA = "v1"
+ @@CaFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
+ @@ClusterName = nil
+ @@ClusterId = nil
+ @@IsNodeMaster = nil
+ @@IsAROV3Cluster = nil
+ #@@IsValidRunningNode = nil
+ #@@IsLinuxCluster = nil
+ @@KubeSystemNamespace = "kube-system"
+ @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt"
+ @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M
+ @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+ @@TokenStr = nil
+ @@NodeMetrics = Hash.new
+ @@WinNodeArray = []
+
+ def initialize
+ end
+
+ class << self
+ def getKubeResourceInfo(resource, api_group: nil)
+ headers = {}
+ response = nil
+ @Log.info "Getting Kube resource: #{resource}"
+ begin
+ resourceUri = getResourceUri(resource, api_group)
+ if !resourceUri.nil?
+ uri = URI.parse(resourceUri)
+ if !File.exist?(@@CaFile)
+ raise "#{@@CaFile} doesnt exist"
+ else
+ Net::HTTP.start(uri.host, uri.port, :use_ssl => true, :ca_file => @@CaFile, :verify_mode => OpenSSL::SSL::VERIFY_PEER, :open_timeout => 20, :read_timeout => 40) do |http|
+ kubeApiRequest = Net::HTTP::Get.new(uri.request_uri)
+ kubeApiRequest["Authorization"] = "Bearer " + getTokenStr
+ @Log.info "KubernetesAPIClient::getKubeResourceInfo : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}"
+ response = http.request(kubeApiRequest)
+ @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ rescue => error
+ @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}")
+ end
+ if (!response.nil?)
+ if (!response.body.nil? && response.body.empty?)
+ @Log.warn("KubernetesAPIClient::getKubeResourceInfo : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}")
+ end
+ end
+ return response
+ end
+
+ def getTokenStr
+ return @@TokenStr if !@@TokenStr.nil?
+ begin
+ if File.exist?(@@TokenFileName) && File.readable?(@@TokenFileName)
+ @@TokenStr = File.read(@@TokenFileName).strip
+ return @@TokenStr
+ else
+ @Log.warn("Unable to read token string from #{@@TokenFileName}: #{error}")
+ return nil
+ end
+ end
+ end
+
+ def getClusterRegion
+ if ENV["AKS_REGION"]
+ return ENV["AKS_REGION"]
+ else
+ @Log.warn ("Kubernetes environment variable not set AKS_REGION. Unable to get cluster region.")
+ return nil
+ end
+ end
+
+ def getResourceUri(resource, api_group)
+ begin
+ if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"]
+ if api_group.nil?
+ return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + resource
+ elsif api_group == @@ApiGroupApps
+ return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/apis/apps/" + @@ApiVersionApps + "/" + resource
+ elsif api_group == @@ApiGroupHPA
+ return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/apis/" + @@ApiGroupHPA + "/" + @@ApiVersionHPA + "/" + resource
+ end
+ else
+ @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri")
+ return nil
+ end
+ end
+ end
+
+ def getClusterName
+ return @@ClusterName if !@@ClusterName.nil?
+ @@ClusterName = "None"
+ begin
+ #try getting resource ID for aks
+ cluster = ENV["AKS_RESOURCE_ID"]
+ if cluster && !cluster.nil? && !cluster.empty?
+ @@ClusterName = cluster.split("/").last
+ else
+ cluster = ENV["ACS_RESOURCE_NAME"]
+ if cluster && !cluster.nil? && !cluster.empty?
+ @@ClusterName = cluster
+ else
+ kubesystemResourceUri = "namespaces/" + @@KubeSystemNamespace + "/pods"
+ @Log.info("KubernetesApiClient::getClusterName : Getting pods from Kube API @ #{Time.now.utc.iso8601}")
+ podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body)
+ @Log.info("KubernetesApiClient::getClusterName : Done getting pods from Kube API @ #{Time.now.utc.iso8601}")
+ podInfo["items"].each do |items|
+ if items["metadata"]["name"].include? "kube-controller-manager"
+ items["spec"]["containers"][0]["command"].each do |command|
+ if command.include? "--cluster-name"
+ @@ClusterName = command.split("=")[1]
+ end
+ end
+ end
+ end
+ end
+ end
+ rescue => error
+ @Log.warn("getClusterName failed: #{error}")
+ end
+ return @@ClusterName
+ end
+
+ def getClusterId
+ return @@ClusterId if !@@ClusterId.nil?
+ #By default initialize ClusterId to ClusterName.
+ # In ACS/On-prem, we need to figure out how we can generate ClusterId
+ # Dilipr: Spoof the subid by generating md5 hash of cluster name, and taking some constant parts of it.
+ # e.g. md5 digest is 128 bits = 32 character in hex. Get first 16 and get a guid, and the next 16 to get resource id
+ @@ClusterId = getClusterName
+ begin
+ cluster = ENV["AKS_RESOURCE_ID"]
+ if cluster && !cluster.nil? && !cluster.empty?
+ @@ClusterId = cluster
+ end
+ rescue => error
+ @Log.warn("getClusterId failed: #{error}")
+ end
+ return @@ClusterId
+ end
+
+ def isAROV3Cluster
+ return @@IsAROV3Cluster if !@@IsAROV3Cluster.nil?
+ @@IsAROV3Cluster = false
+ begin
+ cluster = getClusterId
+ if !cluster.nil? && !cluster.empty? && cluster.downcase.include?("/microsoft.containerservice/openshiftmanagedclusters")
+ @@IsAROV3Cluster = true
+ end
+ rescue => error
+ @Log.warn("KubernetesApiClient::IsAROV3Cluster : IsAROV3Cluster failed #{error}")
+ end
+ return @@IsAROV3Cluster
+ end
+
+ def isAROv3MasterOrInfraPod(nodeName)
+ return isAROV3Cluster() && (!nodeName.nil? && (nodeName.downcase.start_with?("infra-") || nodeName.downcase.start_with?("master-")))
+ end
+
+ def isNodeMaster
+ return @@IsNodeMaster if !@@IsNodeMaster.nil?
+ @@IsNodeMaster = false
+ begin
+ @Log.info("KubernetesApiClient::isNodeMaster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}")
+ allNodesInfo = JSON.parse(getKubeResourceInfo("nodes").body)
+ @Log.info("KubernetesApiClient::isNodeMaster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}")
+ if !allNodesInfo.nil? && !allNodesInfo.empty?
+ thisNodeName = OMS::Common.get_hostname
+ allNodesInfo["items"].each do |item|
+ if item["metadata"]["name"].casecmp(thisNodeName) == 0
+ if item["metadata"]["labels"]["kubernetes.io/role"].to_s.include?("master") || item["metadata"]["labels"]["role"].to_s.include?("master")
+ @@IsNodeMaster = true
+ end
+ break
+ end
+ end
+ end
+ rescue => error
+ @Log.warn("KubernetesApiClient::isNodeMaster : node role request failed: #{error}")
+ end
+
+ return @@IsNodeMaster
+ end
+
+ def getNodesResourceUri(nodesResourceUri)
+ begin
+ # For ARO v3 cluster, filter out all other node roles other than compute
+ if isAROV3Cluster()
+ if !nodesResourceUri.nil? && !nodesResourceUri.index("?").nil?
+ nodesResourceUri = nodesResourceUri + "&labelSelector=node-role.kubernetes.io%2Fcompute%3Dtrue"
+ else
+ nodesResourceUri = nodesResourceUri + "labelSelector=node-role.kubernetes.io%2Fcompute%3Dtrue"
+ end
+ end
+ rescue => error
+ @Log.warn("getNodesResourceUri failed: #{error}")
+ end
+ return nodesResourceUri
+ end
+
+ #def isValidRunningNode
+ # return @@IsValidRunningNode if !@@IsValidRunningNode.nil?
+ # @@IsValidRunningNode = false
+ # begin
+ # thisNodeName = OMS::Common.get_hostname
+ # if isLinuxCluster
+ # # Run on agent node [0]
+ # @@IsValidRunningNode = !isNodeMaster && thisNodeName.to_s.split('-').last == '0'
+ # else
+ # # Run on master node [0]
+ # @@IsValidRunningNode = isNodeMaster && thisNodeName.to_s.split('-').last == '0'
+ # end
+ # rescue => error
+ # @Log.warn("Checking Node Type failed: #{error}")
+ # end
+ # if(@@IsValidRunningNode == true)
+ # @Log.info("Electing current node to talk to k8 api")
+ # else
+ # @Log.info("Not Electing current node to talk to k8 api")
+ # end
+ # return @@IsValidRunningNode
+ #end
+
+ #def isLinuxCluster
+ # return @@IsLinuxCluster if !@@IsLinuxCluster.nil?
+ # @@IsLinuxCluster = true
+ # begin
+ # @Log.info("KubernetesApiClient::isLinuxCluster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}")
+ # allNodesInfo = JSON.parse(getKubeResourceInfo('nodes').body)
+ # @Log.info("KubernetesApiClient::isLinuxCluster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}")
+ # if !allNodesInfo.nil? && !allNodesInfo.empty?
+ # allNodesInfo['items'].each do |item|
+ # if !(item['status']['nodeInfo']['operatingSystem'].casecmp('linux') == 0)
+ # @@IsLinuxCluster = false
+ # break
+ # end
+ # end
+ # end
+ # rescue => error
+ # @Log.warn("KubernetesApiClient::isLinuxCluster : node role request failed: #{error}")
+ # end
+ # return @@IsLinuxCluster
+ #end
+
+    # returns an array of pods (json)
+ def getPods(namespace)
+ pods = []
+ begin
+ kubesystemResourceUri = "namespaces/" + namespace + "/pods"
+ podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body)
+ podInfo["items"].each do |items|
+ pods.push items
+ end
+ rescue => error
+ @Log.warn("List pods request failed: #{error}")
+ end
+ return pods
+ end
+
# Returns an array of hashes, one per Windows node, mapping node address
# types (e.g. "InternalIP", "Hostname") to their addresses; returns nil on
# failure. Side effect: refreshes the @@WinNodeArray cache of windows node
# names used by kubepodinventory for ContainerInventory (images/tags).
def getWindowsNodes
  winNodes = []
  begin
    # Ask the API server for windows nodes only via a label selector.
    resourceUri = getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows")
    nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body)
    @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api"
    # Resetting the windows node cache
    @@WinNodeArray.clear
    unless nodeInventory.empty?
      nodeInventory["items"].each do |node|
        nodeStatus = node["status"]
        nodeMetadata = node["metadata"]
        # Double-check the OS reported in node status (belt and braces over
        # the label selector above).
        osName = nodeStatus && nodeStatus["nodeInfo"] && nodeStatus["nodeInfo"]["operatingSystem"]
        next unless osName.is_a?(String) && osName.casecmp("windows") == 0
        # Cache the node name for ContainerInventory collection on windows nodes.
        if nodeMetadata && nodeMetadata["name"]
          @@WinNodeArray.push(nodeMetadata["name"])
        end
        addresses = nodeStatus["addresses"]
        next if addresses.nil?
        winNode = {}
        addresses.each { |address| winNode[address["type"]] = address["address"] }
        winNodes.push(winNode)
      end
    end
    return winNodes
  rescue => error
    @Log.warn("Error in get windows nodes: #{error}")
    return nil
  end
end
+
# Accessor for the cached list of windows node names. The cache is
# (re)populated by getWindowsNodes, so the value may be stale or empty
# until that method has run at least once.
def getWindowsNodesArray
  return @@WinNodeArray
end
+
# Returns a Hash mapping containerID -> namespace for every container that
# has a status entry in the given namespace's pods. Returns an empty hash on
# failure (errors are logged).
def getContainerIDs(namespace)
  containers = Hash.new
  begin
    kubesystemResourceUri = "namespaces/" + namespace + "/pods"
    @Log.info("KubernetesApiClient::getContainerIDs : Getting pods from Kube API @ #{Time.now.utc.iso8601}")
    podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body)
    @Log.info("KubernetesApiClient::getContainerIDs : Done getting pods from Kube API @ #{Time.now.utc.iso8601}")
    podInfo["items"].each do |item|
      status = item["status"]
      if (!status.nil? && !status.empty? && !status["containerStatuses"].nil? && !status["containerStatuses"].empty?)
        status["containerStatuses"].each do |cntr|
          # Fix: map to the requested namespace instead of the previously
          # hard-coded "kube-system", so the method honors its parameter.
          # (Behavior is unchanged for the existing kube-system callers.)
          containers[cntr["containerID"]] = namespace
        end
      end
    end
  rescue => error
    @Log.warn("KubernetesApiClient::getContainerIDs : List ContainerIDs request failed: #{error}")
  end
  return containers
end
+
# Fetches the raw log stream for one container of a pod.
# showTimeStamp: when truthy, asks the API server to prefix each log line
# with its timestamp. Returns "" on failure (errors are logged).
def getContainerLogs(namespace, pod, container, showTimeStamp)
  containerLogs = ""
  begin
    kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container
    if showTimeStamp
      # Fix: was "×tamps=true" — a mojibake of "&timestamps=true" (the "&times"
      # prefix was rendered as the multiplication sign), which produced an
      # invalid query parameter and silently disabled timestamps.
      kubesystemResourceUri += "&timestamps=true"
    end
    @Log.info("KubernetesApiClient::getContainerLogs : Getting logs from Kube API @ #{Time.now.utc.iso8601}")
    containerLogs = getKubeResourceInfo(kubesystemResourceUri).body
    @Log.info("KubernetesApiClient::getContainerLogs : Done getting logs from Kube API @ #{Time.now.utc.iso8601}")
  rescue => error
    @Log.warn("Pod logs request failed: #{error}")
  end
  return containerLogs
end
+
# Fetches container logs emitted at or after the RFC3339 timestamp `since`.
# showTimeStamp: when truthy, asks the API server to timestamp each line.
# Returns "" on failure (errors are logged).
def getContainerLogsSinceTime(namespace, pod, container, since, showTimeStamp)
  containerLogs = ""
  begin
    kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container + "&sinceTime=" + since
    # Percent-encode ':', '.' and '+' (present in RFC3339 timestamps) for the URL.
    # Fix: URI.escape was deprecated in Ruby 2.7 and removed in 3.0 — under the
    # rescue below it would silently break this method — so do the exact
    # equivalent escaping inline.
    kubesystemResourceUri = kubesystemResourceUri.gsub(/[:.+]/) { |ch| format("%%%02X", ch.ord) }

    if showTimeStamp
      # Fix: was "×tamps=true" — a mojibake of "&timestamps=true" ("&times"
      # rendered as the multiplication sign), an invalid query parameter.
      kubesystemResourceUri += "&timestamps=true"
    end
    @Log.info("calling #{kubesystemResourceUri}")
    @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Getting logs from Kube API @ #{Time.now.utc.iso8601}")
    containerLogs = getKubeResourceInfo(kubesystemResourceUri).body
    @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Done getting logs from Kube API @ #{Time.now.utc.iso8601}")
  rescue => error
    @Log.warn("Pod logs request failed: #{error}")
  end
  return containerLogs
end
+
# Resolves the uid used to correlate a pod with cAdvisor records.
# For kube-system pods without an ownerReferences controller — the only case
# where cAdvisor reports kubernetes.io/config.hash instead of the real pod
# uid — the config hash annotation is returned so records stay correlatable
# with cadvisor data (this uid is not surfaced in the UX, so the substitution
# is acceptable). Otherwise the pod's metadata uid is returned.
# Returns nil when no uid can be determined.
def getPodUid(podNameSpace, podMetadata)
  podUid = nil
  begin
    if podNameSpace.eql?("kube-system") && !podMetadata.key?("ownerReferences")
      annotations = podMetadata["annotations"]
      podUid = annotations["kubernetes.io/config.hash"] unless annotations.nil?
    else
      podUid = podMetadata["uid"]
    end
  rescue => errorStr
    @Log.warn "KubernetesApiClient::getPodUid:Failed to get poduid: #{errorStr}"
    ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
  end
  podUid
end
+
# Builds K8SContainer perf records for the requests or limits of every
# container (including init containers) of a scheduled pod.
# pod:                 parsed pod JSON hash
# metricCategory:      "requests" or "limits"
# metricNameToCollect: resource key, e.g. "cpu" or "memory"
# metricNametoReturn:  counter name to emit
# metricTime:          shared timestamp for all emitted records
# Returns an array of { "DataItems" => [...] } records; empty when the pod is
# unscheduled, has no uid, runs on an ARO master/infra node, or on error.
def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
  metricItems = []
  begin
    clusterId = getClusterId
    podNameSpace = pod["metadata"]["namespace"]
    podUid = getPodUid(podNameSpace, pod["metadata"])
    return metricItems if podUid.nil?

    # for unscheduled (non-started) pods nodeName does NOT exist
    nodeName = ""
    nodeName = pod["spec"]["nodeName"] if !pod["spec"]["nodeName"].nil?
    # For ARO, skip the pods scheduled on to master or infra nodes
    return metricItems if isAROv3MasterOrInfraPod(nodeName)

    podContainers = []
    if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty?
      podContainers += pod["spec"]["containers"]
    end
    # Include init containers in the record list as well.
    if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty?
      podContainers += pod["spec"]["initContainers"]
    end

    # Shared record builder — this ~20-line construction was previously
    # duplicated verbatim in both branches below.
    addMetricItem = lambda do |containerName, metricValue|
      metricProps = {
        "Timestamp" => metricTime,
        "Host" => nodeName,
        # Computer is set here so that it is not set later by base omsagent
        "Computer" => nodeName,
        "ObjectName" => "K8SContainer",
        "InstanceName" => clusterId + "/" + podUid + "/" + containerName,
        "Collections" => [{ "CounterName" => metricNametoReturn, "Value" => metricValue }],
      }
      metricItems.push({ "DataItems" => [metricProps] })
    end

    if !podContainers.empty? && !pod["spec"]["nodeName"].nil?
      podContainers.each do |container|
        containerName = container["name"]
        resources = container["resources"]
        if (!resources.nil? && !resources.empty? && !resources[metricCategory].nil? && !resources[metricCategory][metricNameToCollect].nil?)
          addMetricItem.call(containerName, getMetricNumericValue(metricNameToCollect, resources[metricCategory][metricNameToCollect]))
        else
          # No container-level limit for the given metric: fall back to the
          # cached node-level allocatable value (limits only).
          nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect
          if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey))
            addMetricItem.call(containerName, @@NodeMetrics[nodeMetricsHashKey])
          end
        end
      end
    end
  rescue => error
    @Log.warn("getcontainerResourceRequestsAndLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
    return metricItems
  end
  return metricItems
end #getContainerResourceRequestAndLimits
+
# Builds InsightsMetrics records for the requests or limits of every
# container (including init containers) of a pod; unlike the perf variant,
# unscheduled pods are kept (nodeName = "") so GPU requests/limits are still
# collected.
# Returns an array of flat metric hashes; empty when no uid or on error.
def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
  metricItems = []
  begin
    clusterId = getClusterId
    clusterName = getClusterName
    podNameSpace = pod["metadata"]["namespace"]
    # Consistency fix: reuse getPodUid — it implements the identical
    # kube-system/no-ownerReferences special case (cAdvisor reports
    # kubernetes.io/config.hash there) that was previously duplicated inline.
    podUid = getPodUid(podNameSpace, pod["metadata"])
    return metricItems if podUid.nil?

    podContainers = []
    if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty?
      podContainers += pod["spec"]["containers"]
    end
    # Include init containers in the record list as well.
    if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty?
      podContainers += pod["spec"]["initContainers"]
    end

    if !podContainers.empty?
      if !pod["spec"]["nodeName"].nil?
        nodeName = pod["spec"]["nodeName"]
      else
        nodeName = "" # unscheduled pod. We still want to collect limits & requests for GPU
      end
      podContainers.each do |container|
        metricValue = nil
        containerName = container["name"]
        if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?)
          metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect])
        else
          # No container-level limit for the given metric, so default to the
          # cached node-level limit for non-gpu metrics.
          if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu")
            nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect
            metricValue = @@NodeMetrics[nodeMetricsHashKey]
          end
        end
        if (!metricValue.nil?)
          metricItem = {}
          metricItem["CollectionTime"] = metricTime
          metricItem["Computer"] = nodeName
          metricItem["Name"] = metricNametoReturn
          metricItem["Value"] = metricValue
          metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
          metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE

          metricTags = {}
          metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId
          metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName
          metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName

          metricItem["Tags"] = metricTags

          metricItems.push(metricItem)
        end
      end
    end
  rescue => error
    @Log.warn("getcontainerResourceRequestsAndLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
    return metricItems
  end
  return metricItems
end #getContainerResourceRequestAndLimitsAsInsightsMetrics
+
# Parses node capacity/allocatable metrics for every node in a node-list
# response, delegating per-node work to parseNodeLimitsFromNodeItem.
# All nodes share the caller-supplied metricTime because the kube api does
# not timestamp capacity/allocatable values.
# Returns an array of non-empty metric records.
def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
  metricItems = []
  begin
    metricInfo = metricJSON
    # Fix: removed an unused `clusterId = getClusterId` local — the cluster id
    # is resolved inside parseNodeLimitsFromNodeItem where it is actually used.
    metricInfo["items"].each do |node|
      metricItem = parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime)
      if !metricItem.nil? && !metricItem.empty?
        metricItems.push(metricItem)
      end
    end
  rescue => error
    @Log.warn("parseNodeLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
  end
  return metricItems
end #parseNodeLimits
+
# Builds one K8SNode perf record from a single node object.
# metricCategory: "capacity" or "allocatable"; metricNameToCollect: e.g.
# "cpu" or "memory". All nodes in a batch share the caller-supplied
# metricTime because the kube api does not timestamp these values.
# Side effect: caches the node-level value in @@NodeMetrics so container
# records can fall back to node limits when container limits are absent.
# Returns an empty hash when the metric is missing or on error.
def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
  metricItem = {}
  begin
    clusterId = getClusterId
    categoryValues = node["status"][metricCategory]
    if !categoryValues.nil? && !categoryValues[metricNameToCollect].nil?
      metricValue = getMetricNumericValue(metricNameToCollect, categoryValues[metricNameToCollect])
      nodeName = node["metadata"]["name"]
      metricItem["DataItems"] = [
        {
          "Timestamp" => metricTime,
          "Host" => nodeName,
          # Computer is set here so that it is not set later by base omsagent
          "Computer" => nodeName,
          "ObjectName" => "K8SNode",
          "InstanceName" => clusterId + "/" + nodeName,
          "Collections" => [{ "CounterName" => metricNametoReturn, "Value" => metricValue }],
        },
      ]
      # Cache for container-level fallback lookups.
      @@NodeMetrics[clusterId + "/" + nodeName + "_" + metricCategory + "_" + metricNameToCollect] = metricValue
    end
  rescue => error
    @Log.warn("parseNodeLimitsFromNodeItem failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
  end
  return metricItem
end #parseNodeLimitsFromNodeItem
+
# Builds one InsightsMetrics record from a single node object.
# metricCategory: "capacity" or "allocatable"; metricNameToCollect: "cpu",
# "memory", "amd.com/gpu" or "nvidia.com/gpu". All nodes in a batch share the
# caller-supplied metricTime.
# Side effect: non-gpu node values are cached in @@NodeMetrics for
# container-level fallback lookups.
# Returns an empty hash when the metric is missing or on error.
def parseNodeLimitsAsInsightsMetrics(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
  metricItem = {}
  begin
    categoryValues = node["status"][metricCategory]
    if !categoryValues.nil? && !categoryValues[metricNameToCollect].nil?
      clusterId = getClusterId
      clusterName = getClusterName
      metricValue = getMetricNumericValue(metricNameToCollect, categoryValues[metricNameToCollect])
      nodeName = node["metadata"]["name"]

      metricItem["CollectionTime"] = metricTime
      metricItem["Computer"] = nodeName
      metricItem["Name"] = metricNametoReturn
      metricItem["Value"] = metricValue
      metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
      metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE
      metricItem["Tags"] = {
        Constants::INSIGHTSMETRICS_TAGS_CLUSTERID => clusterId,
        Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME => clusterName,
        Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR => metricNameToCollect,
      }

      # Cache non-gpu node values for container-level fallback lookups.
      isGpu = (metricNameToCollect.downcase == "nvidia.com/gpu") || (metricNameToCollect.downcase == "amd.com/gpu")
      unless isGpu
        @@NodeMetrics[clusterId + "/" + nodeName + "_" + metricCategory + "_" + metricNameToCollect] = metricValue
      end
    end
  rescue => error
    @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
  end
  return metricItem
end
+
# Normalizes a kubernetes resource quantity string to a numeric value:
#   memory -> bytes (binary suffixes Ki..Yi use 1024^n, decimal k..y use 1000^n)
#   cpu    -> nanocores ("m" = millicores, bare value = cores)
#   nvidia.com/gpu, amd.com/gpu -> plain float count
# Unsupported metric names or unparsable values return 0 (logged).
def getMetricNumericValue(metricName, metricVal)
  value = metricVal.downcase
  begin
    case metricName
    when "memory" # convert to bytes for memory
      # https://kubernetes.io/docs/tasks/configure-pod-container/assign-memory-resource/
      # Two-character binary suffixes must be tried before the one-character
      # decimal ones (e.g. "mi" before "m").
      binarySuffixes = { "ki" => 1, "mi" => 2, "gi" => 3, "ti" => 4, "pi" => 5, "ei" => 6, "zi" => 7, "yi" => 8 }
      decimalSuffixes = { "k" => 1, "m" => 2, "g" => 3, "t" => 4, "p" => 5, "e" => 6, "z" => 7, "y" => 8 }
      if (hit = binarySuffixes.find { |suffix, _| value.end_with?(suffix) })
        value = Float(value.chomp(hit[0])) * 1024.0 ** hit[1]
      elsif (hit = decimalSuffixes.find { |suffix, _| value.end_with?(suffix) })
        value = Float(value.chomp(hit[0])) * 1000.0 ** hit[1]
      else
        # No unit: assume bytes (Float() raises for anything unsupported,
        # which the rescue below turns into 0).
        value = Float(value)
      end
    when "cpu" # convert to nanocores for cpu
      # https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/
      if value.end_with?("m")
        value = Float(value.chomp("m")) * 1000.0 ** 2
      elsif value.end_with?("k")
        value = Float(value.chomp("k")) * 1000.0
      else
        # No unit: cores -> nanocores.
        value = Float(value) * 1000.0 ** 3
      end
    when "nvidia.com/gpu", "amd.com/gpu"
      value = Float(value) * 1.0
    else
      @Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. Returning 0 for metric value")
      value = 0
    end
  rescue => error
    @Log.warn("getMetricNumericValue failed: #{error} for metric #{metricName} with value #{metricVal}. Returning 0 for metric value")
    return 0
  end
  return value
end # getMetricNumericValue
+
# Fetches one page of a (possibly paginated) kube list API response.
# uri: list resource path; api_group: optional non-core API group.
# Returns [continuationToken, resourceInventory]:
#   continuationToken — metadata["continue"] for the next page (may be nil)
#   resourceInventory — parsed response hash, or nil on failure.
def getResourcesAndContinuationToken(uri, api_group: nil)
  continuationToken = nil
  resourceInventory = nil
  begin
    @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}"
    resourceInfo = getKubeResourceInfo(uri, api_group: api_group)
    @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Done getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}"
    unless resourceInfo.nil?
      @Log.info "KubernetesApiClient::getResourcesAndContinuationToken:Start:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}"
      resourceInventory = Yajl::Parser.parse(StringIO.new(resourceInfo.body))
      @Log.info "KubernetesApiClient::getResourcesAndContinuationToken:End:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}"
      # Drop the raw response promptly to keep memory usage down.
      resourceInfo = nil
    end
    metadata = resourceInventory.nil? ? nil : resourceInventory["metadata"]
    continuationToken = metadata["continue"] unless metadata.nil?
  rescue => errorStr
    @Log.warn "KubernetesApiClient::getResourcesAndContinuationToken:Failed in get resources for #{uri} and continuation token: #{errorStr}"
    ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
    resourceInventory = nil
  end
  return continuationToken, resourceInventory
end #getResourcesAndContinuationToken
+
# Derives the in-cluster API server base URL from the standard kubernetes
# service environment variables. Returns nil (with a warning logged) when
# either variable is unset.
def getKubeAPIServerUrl
  apiServerUrl = nil
  begin
    host = ENV["KUBERNETES_SERVICE_HOST"]
    port = ENV["KUBERNETES_PORT_443_TCP_PORT"]
    if host && port
      apiServerUrl = "https://#{host}:#{port}"
    else
      @Log.warn "Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri"
    end
  rescue => errorStr
    @Log.warn "KubernetesApiClient::getKubeAPIServerUrl:Failed #{errorStr}"
  end
  return apiServerUrl
end
+
# Builds KubeServices inventory records from a parsed service-list response.
# serviceList: parsed JSON hash with an "items" array (nil/empty tolerated).
# batchTime: timestamp mapped to TimeGenerated downstream.
# Returns an array of record hashes; empty on nil/empty input or error.
def getKubeServicesInventoryRecords(serviceList, batchTime = Time.now.utc.iso8601)
  # Fix: the default was `Time.utc.iso8601` — Time.utc requires at least a
  # year argument, so calling this method without batchTime raised
  # ArgumentError before the rescue below could even run.
  kubeServiceRecords = []
  begin
    if (!serviceList.nil? && !serviceList.empty?)
      servicesCount = serviceList["items"].length
      @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}")
      serviceList["items"].each do |item|
        kubeServiceRecord = {}
        kubeServiceRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated
        kubeServiceRecord["ServiceName"] = item["metadata"]["name"]
        kubeServiceRecord["Namespace"] = item["metadata"]["namespace"]
        kubeServiceRecord["SelectorLabels"] = [item["spec"]["selector"]]
        # ClusterId/ClusterName are added just before emit to avoid the extra
        # memory footprint of carrying them on every record here.
        kubeServiceRecord["ClusterIP"] = item["spec"]["clusterIP"]
        kubeServiceRecord["ServiceType"] = item["spec"]["type"]
        kubeServiceRecords.push(kubeServiceRecord.dup)
      end
    end
  rescue => errorStr
    @Log.warn "KubernetesApiClient::getKubeServicesInventoryRecords:Failed with an error : #{errorStr}"
    ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
  end
  return kubeServiceRecords
end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/MdmAlertTemplates.rb b/source/plugins/ruby-fluentd4/MdmAlertTemplates.rb
new file mode 100644
index 000000000..ef63cf219
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/MdmAlertTemplates.rb
@@ -0,0 +1,200 @@
+# frozen_string_literal: true
+
# Templates for the JSON payloads posted to the Azure Monitor custom metrics
# (MDM) endpoint. Each constant is a format string rendered by callers with
# %{...} named substitutions (Kernel#format / String#%). The numeric
# placeholders (min/max/sum) are deliberately unquoted so they render as JSON
# numbers; count is always 1 because each payload carries a single sample.
class MdmAlertTemplates
  # Pod-scoped count metric, dimensioned by controller name and namespace.
  Pod_metrics_template = '
    {
      "time": "%{timestamp}",
      "data": {
        "baseData": {
          "metric": "%{metricName}",
          "namespace": "insights.container/pods",
          "dimNames": [
            "controllerName",
            "Kubernetes namespace"
          ],
          "series": [
            {
              "dimValues": [
                "%{controllerNameDimValue}",
                "%{namespaceDimValue}"
              ],
              "min": %{containerCountMetricValue},
              "max": %{containerCountMetricValue},
              "sum": %{containerCountMetricValue},
              "count": 1
            }
          ]
        }
      }
    }'

  # Completed-job count metric; the "olderThanHours" dimension is fixed at 6.
  Stable_job_metrics_template = '
    {
      "time": "%{timestamp}",
      "data": {
        "baseData": {
          "metric": "%{metricName}",
          "namespace": "insights.container/pods",
          "dimNames": [
            "controllerName",
            "Kubernetes namespace",
            "olderThanHours"
          ],
          "series": [
            {
              "dimValues": [
                "%{controllerNameDimValue}",
                "%{namespaceDimValue}",
                "6"
              ],
              "min": %{containerCountMetricValue},
              "max": %{containerCountMetricValue},
              "sum": %{containerCountMetricValue},
              "count": 1
            }
          ]
        }
      }
    }'

  # Container cpu/memory utilization percentage, dimensioned down to the
  # individual container plus the threshold it is compared against.
  Container_resource_utilization_template = '
    {
      "time": "%{timestamp}",
      "data": {
        "baseData": {
          "metric": "%{metricName}",
          "namespace": "insights.container/containers",
          "dimNames": [
            "containerName",
            "podName",
            "controllerName",
            "Kubernetes namespace",
            "thresholdPercentage"
          ],
          "series": [
            {
              "dimValues": [
                "%{containerNameDimValue}",
                "%{podNameDimValue}",
                "%{controllerNameDimValue}",
                "%{namespaceDimValue}",
                "%{thresholdPercentageDimValue}"
              ],
              "min": %{containerResourceUtilizationPercentage},
              "max": %{containerResourceUtilizationPercentage},
              "sum": %{containerResourceUtilizationPercentage},
              "count": 1
            }
          ]
        }
      }
    }'

  # Persistent volume utilization percentage per pod/node/namespace/volume.
  PV_resource_utilization_template = '
    {
      "time": "%{timestamp}",
      "data": {
        "baseData": {
          "metric": "%{metricName}",
          "namespace": "insights.container/persistentvolumes",
          "dimNames": [
            "podName",
            "node",
            "kubernetesNamespace",
            "volumeName",
            "thresholdPercentage"
          ],
          "series": [
            {
              "dimValues": [
                "%{podNameDimValue}",
                "%{computerNameDimValue}",
                "%{namespaceDimValue}",
                "%{volumeNameDimValue}",
                "%{thresholdPercentageDimValue}"
              ],
              "min": %{pvResourceUtilizationPercentage},
              "max": %{pvResourceUtilizationPercentage},
              "sum": %{pvResourceUtilizationPercentage},
              "count": 1
            }
          ]
        }
      }
    }'

  # Node-level resource metric, dimensioned by host; unlike the templates
  # above, min/max/sum take distinct placeholder values.
  Node_resource_metrics_template = '
    {
      "time": "%{timestamp}",
      "data": {
        "baseData": {
          "metric": "%{metricName}",
          "namespace": "Insights.Container/nodes",
          "dimNames": [
            "host"
          ],
          "series": [
            {
              "dimValues": [
                "%{hostvalue}"
              ],
              "min": %{metricminvalue},
              "max": %{metricmaxvalue},
              "sum": %{metricsumvalue},
              "count": 1
            }
          ]
        }
      }
    }'

  # Aggregation - Sum
  # Disk used percentage per host and device.
  Disk_used_percentage_metrics_template = '
    {
      "time": "%{timestamp}",
      "data": {
        "baseData": {
          "metric": "%{metricName}",
          "namespace": "Insights.Container/nodes",
          "dimNames": [
            "host",
            "device"
          ],
          "series": [
            {
              "dimValues": [
                "%{hostvalue}",
                "%{devicevalue}"
              ],
              "min": %{diskUsagePercentageValue},
              "max": %{diskUsagePercentageValue},
              "sum": %{diskUsagePercentageValue},
              "count": 1
            }
          ]
        }
      }
    }'

  # Generic single-sample metric: callers supply the namespace suffix and
  # pre-rendered, comma-separated dimNames/dimValues JSON fragments.
  Generic_metric_template = '
    {
      "time": "%{timestamp}",
      "data": {
        "baseData": {
          "metric": "%{metricName}",
          "namespace": "Insights.Container/%{namespaceSuffix}",
          "dimNames": [%{dimNames}],
          "series": [
            {
              "dimValues": [%{dimValues}],
              "min": %{metricValue},
              "max": %{metricValue},
              "sum": %{metricValue},
              "count": 1
            }
          ]
        }
      }
    }'
end
diff --git a/source/plugins/ruby-fluentd4/MdmMetricsGenerator.rb b/source/plugins/ruby-fluentd4/MdmMetricsGenerator.rb
new file mode 100644
index 000000000..12d462e44
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/MdmMetricsGenerator.rb
@@ -0,0 +1,526 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+class MdmMetricsGenerator
+ require "logger"
+ require "yajl/json_gem"
+ require "json"
+ require_relative "MdmAlertTemplates"
+ require_relative "ApplicationInsightsUtility"
+ require_relative "constants"
+ require_relative "oms_common"
+
+ @log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log"
+ @log = Logger.new(@log_path, 1, 5000000)
+ @@hostName = (OMS::Common.get_hostname)
+
+ @oom_killed_container_count_hash = {}
+ @container_restart_count_hash = {}
+ @stale_job_count_hash = {}
+ @pod_ready_hash = {}
+ @pod_not_ready_hash = {}
+ @pod_ready_percentage_hash = {}
+ @zero_fill_metrics_hash = {
+ Constants::MDM_OOM_KILLED_CONTAINER_COUNT => true,
+ Constants::MDM_CONTAINER_RESTART_COUNT => true,
+ Constants::MDM_STALE_COMPLETED_JOB_COUNT => true,
+ }
+
+ @@node_metric_name_metric_percentage_name_hash = {
+ Constants::CPU_USAGE_MILLI_CORES => Constants::MDM_NODE_CPU_USAGE_PERCENTAGE,
+ Constants::MEMORY_RSS_BYTES => Constants::MDM_NODE_MEMORY_RSS_PERCENTAGE,
+ Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE,
+ }
+
+ @@container_metric_name_metric_percentage_name_hash = {
+ Constants::CPU_USAGE_MILLI_CORES => Constants::MDM_CONTAINER_CPU_UTILIZATION_METRIC,
+ Constants::CPU_USAGE_NANO_CORES => Constants::MDM_CONTAINER_CPU_UTILIZATION_METRIC,
+ Constants::MEMORY_RSS_BYTES => Constants::MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC,
+ Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC,
+ }
+
+ @@pod_metric_name_metric_percentage_name_hash = {
+ Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC,
+ }
+
+ # Setting this to true since we need to send zero filled metrics at startup. If metrics are absent alert creation fails
+ @sendZeroFilledMetrics = true
+ @zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i
+
+ def initialize
+ end
+
+ class << self
+ def populatePodReadyPercentageHash
+ begin
+ @log.info "in populatePodReadyPercentageHash..."
+ @pod_ready_hash.each { |dim_key, value|
+ podsNotReady = @pod_not_ready_hash.key?(dim_key) ? @pod_not_ready_hash[dim_key] : 0
+ totalPods = value + podsNotReady
+          podsReadyPercentage = value * 100.0 / totalPods
+ @pod_ready_percentage_hash[dim_key] = podsReadyPercentage
+ # Deleting this key value pair from not ready hash,
+ # so that we can get those dimensions for which there are 100% of the pods in not ready state
+ if (@pod_not_ready_hash.key?(dim_key))
+ @pod_not_ready_hash.delete(dim_key)
+ end
+ }
+
+ # Add 0% pod ready for these dimensions
+ if @pod_not_ready_hash.length > 0
+ @pod_not_ready_hash.each { |key, value|
+ @pod_ready_percentage_hash[key] = 0
+ }
+ end
+
+ # Cleaning up hashes after use
+ @pod_ready_hash = {}
+ @pod_not_ready_hash = {}
+ rescue => errorStr
+ @log.info "Error in populatePodReadyPercentageHash: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ end
+
+ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplate)
+ begin
+ @log.info "in appendPodMetrics..."
+ if !metricHash.empty?
+ metricHash.each { |key, value|
+ key_elements = key.split("~~")
+ if key_elements.length != 2
+ next
+ end
+
+ # get dimension values by key
+ podControllerNameDimValue = key_elements[0]
+ podNamespaceDimValue = key_elements[1]
+
+ record = metricsTemplate % {
+ timestamp: batch_time,
+ metricName: metricName,
+ controllerNameDimValue: podControllerNameDimValue,
+ namespaceDimValue: podNamespaceDimValue,
+ containerCountMetricValue: value,
+ }
+ records.push(Yajl::Parser.parse(StringIO.new(record)))
+ }
+ else
+ @log.info "No records found in hash for metric: #{metricName}"
+ end
+ rescue => errorStr
+ @log.info "Error appending pod metrics for metric: #{metricName} : #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ @log.info "Done appending PodMetrics for metric: #{metricName}..."
+ return records
+ end
+
+ def flushPodMdmMetricTelemetry
+ begin
+ properties = {}
+ # Getting the sum of all values in the hash to send a count to telemetry
+ containerRestartHashValues = @container_restart_count_hash.values
+ containerRestartMetricCount = containerRestartHashValues.inject(0) { |sum, x| sum + x }
+
+ oomKilledContainerHashValues = @oom_killed_container_count_hash.values
+ oomKilledContainerMetricCount = oomKilledContainerHashValues.inject(0) { |sum, x| sum + x }
+
+ staleJobHashValues = @stale_job_count_hash.values
+ staleJobMetricCount = staleJobHashValues.inject(0) { |sum, x| sum + x }
+
+ properties["ContainerRestarts"] = containerRestartMetricCount
+ properties["OomKilledContainers"] = oomKilledContainerMetricCount
+ properties["OldCompletedJobs"] = staleJobMetricCount
+ ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_METRICS_HEART_BEAT_EVENT, properties)
+ ApplicationInsightsUtility.sendCustomEvent(Constants::POD_READY_PERCENTAGE_HEART_BEAT_EVENT, {})
+ rescue => errorStr
+ @log.info "Error in flushMdmMetricTelemetry: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ @log.info "Mdm pod metric telemetry successfully flushed"
+ end
+
+ def clearPodHashes
+ @oom_killed_container_count_hash = {}
+ @container_restart_count_hash = {}
+ @stale_job_count_hash = {}
+ @pod_ready_percentage_hash = {}
+ end
+
+ def zeroFillMetricRecords(records, batch_time)
+ begin
+ @log.info "In zero fill metric records"
+ zero_fill_dim_key = [Constants::OMSAGENT_ZERO_FILL, Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL].join("~~")
+ @oom_killed_container_count_hash[zero_fill_dim_key] = @oom_killed_container_count_hash.key?(zero_fill_dim_key) ? @oom_killed_container_count_hash[zero_fill_dim_key] : 0
+ @container_restart_count_hash[zero_fill_dim_key] = @container_restart_count_hash.key?(zero_fill_dim_key) ? @container_restart_count_hash[zero_fill_dim_key] : 0
+ @stale_job_count_hash[zero_fill_dim_key] = @stale_job_count_hash.key?(zero_fill_dim_key) ? @stale_job_count_hash[zero_fill_dim_key] : 0
+
+ metric_threshold_hash = getContainerResourceUtilizationThresholds
+ container_zero_fill_dims = [Constants::OMSAGENT_ZERO_FILL, Constants::OMSAGENT_ZERO_FILL, Constants::OMSAGENT_ZERO_FILL, Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL].join("~~")
+ containerCpuRecord = getContainerResourceUtilMetricRecords(batch_time,
+ Constants::CPU_USAGE_NANO_CORES,
+ 0,
+ container_zero_fill_dims,
+ metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES])
+ if !containerCpuRecord.nil? && !containerCpuRecord.empty? && !containerCpuRecord[0].nil? && !containerCpuRecord[0].empty?
+ records.push(containerCpuRecord[0])
+ end
+ containerMemoryRssRecord = getContainerResourceUtilMetricRecords(batch_time,
+ Constants::MEMORY_RSS_BYTES,
+ 0,
+ container_zero_fill_dims,
+ metric_threshold_hash[Constants::MEMORY_RSS_BYTES])
+ if !containerMemoryRssRecord.nil? && !containerMemoryRssRecord.empty? && !containerMemoryRssRecord[0].nil? && !containerMemoryRssRecord[0].empty?
+ records.push(containerMemoryRssRecord[0])
+ end
+ containerMemoryWorkingSetRecord = getContainerResourceUtilMetricRecords(batch_time,
+ Constants::MEMORY_WORKING_SET_BYTES,
+ 0,
+ container_zero_fill_dims,
+ metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES])
+ if !containerMemoryWorkingSetRecord.nil? && !containerMemoryWorkingSetRecord.empty? && !containerMemoryWorkingSetRecord[0].nil? && !containerMemoryWorkingSetRecord[0].empty?
+ records.push(containerMemoryWorkingSetRecord[0])
+ end
+
+ pvZeroFillDims = {}
+ pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL
+ pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = Constants::OMSAGENT_ZERO_FILL
+ pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] = Constants::VOLUME_NAME_ZERO_FILL
+ pvResourceUtilMetricRecord = getPVResourceUtilMetricRecords(batch_time,
+ Constants::PV_USED_BYTES,
+ @@hostName,
+ 0,
+ pvZeroFillDims,
+ metric_threshold_hash[Constants::PV_USED_BYTES])
+ if !pvResourceUtilMetricRecord.nil? && !pvResourceUtilMetricRecord.empty? && !pvResourceUtilMetricRecord[0].nil? && !pvResourceUtilMetricRecord[0].empty?
+ records.push(pvResourceUtilMetricRecord[0])
+ end
+ rescue => errorStr
+ @log.info "Error in zeroFillMetricRecords: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return records
+ end
+
+ def appendAllPodMetrics(records, batch_time)
+ begin
+ @log.info "in appendAllPodMetrics..."
+ timeDifference = (DateTime.now.to_time.to_i - @zeroFilledMetricsTimeTracker).abs
+ timeDifferenceInMinutes = timeDifference / 60
+ if @sendZeroFilledMetrics == true || (timeDifferenceInMinutes >= Constants::ZERO_FILL_METRICS_INTERVAL_IN_MINUTES)
+ records = zeroFillMetricRecords(records, batch_time)
+ # Setting it to false after startup
+ @sendZeroFilledMetrics = false
+ @zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i
+ end
+ records = appendPodMetrics(records,
+ Constants::MDM_OOM_KILLED_CONTAINER_COUNT,
+ @oom_killed_container_count_hash,
+ batch_time,
+ MdmAlertTemplates::Pod_metrics_template)
+
+ records = appendPodMetrics(records,
+ Constants::MDM_CONTAINER_RESTART_COUNT,
+ @container_restart_count_hash,
+ batch_time,
+ MdmAlertTemplates::Pod_metrics_template)
+
+ records = appendPodMetrics(records,
+ Constants::MDM_STALE_COMPLETED_JOB_COUNT,
+ @stale_job_count_hash,
+ batch_time,
+ MdmAlertTemplates::Stable_job_metrics_template)
+
+        # Compute the percentage here, because we need to do this after all chunks have been processed.
+ populatePodReadyPercentageHash
+ # @log.info "@pod_ready_percentage_hash: #{@pod_ready_percentage_hash}"
+ records = appendPodMetrics(records,
+ Constants::MDM_POD_READY_PERCENTAGE,
+ @pod_ready_percentage_hash,
+ batch_time,
+ MdmAlertTemplates::Pod_metrics_template)
+ rescue => errorStr
+ @log.info "Error in appendAllPodMetrics: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return records
+ end
+
+ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage)
+ records = []
+ begin
+ if dims.nil?
+ @log.info "Dimensions nil, returning empty records"
+ return records
+ end
+ dimElements = dims.split("~~")
+ if dimElements.length != 4
+ return records
+ end
+
+ # get dimension values
+ containerName = dimElements[0]
+ podName = dimElements[1]
+ controllerName = dimElements[2]
+ podNamespace = dimElements[3]
+
+ resourceUtilRecord = MdmAlertTemplates::Container_resource_utilization_template % {
+ timestamp: recordTimeStamp,
+ metricName: @@container_metric_name_metric_percentage_name_hash[metricName],
+ containerNameDimValue: containerName,
+ podNameDimValue: podName,
+ controllerNameDimValue: controllerName,
+ namespaceDimValue: podNamespace,
+ containerResourceUtilizationPercentage: percentageMetricValue,
+ thresholdPercentageDimValue: thresholdPercentage,
+ }
+ records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord)))
+ rescue => errorStr
+ @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return records
+ end
+
+ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percentageMetricValue, dims, thresholdPercentage)
+ records = []
+ begin
+ containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME]
+ pvcNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE]
+ podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME]
+ podUid = dims[Constants::INSIGHTSMETRICS_TAGS_POD_UID]
+ volumeName = dims[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME]
+
+ resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % {
+ timestamp: recordTimeStamp,
+ metricName: @@pod_metric_name_metric_percentage_name_hash[metricName],
+ podNameDimValue: podName,
+ computerNameDimValue: computer,
+ namespaceDimValue: pvcNamespace,
+ volumeNameDimValue: volumeName,
+ pvResourceUtilizationPercentage: percentageMetricValue,
+ thresholdPercentageDimValue: thresholdPercentage,
+ }
+ records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord)))
+ rescue => errorStr
+ @log.info "Error in getPVResourceUtilMetricRecords: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return records
+ end
+
+ def getDiskUsageMetricRecords(record)
+ records = []
+ usedPercent = nil
+ deviceName = nil
+ hostName = nil
+ begin
+ if !record["fields"].nil?
+ usedPercent = record["fields"]["used_percent"]
+ end
+ if !record["tags"].nil?
+ deviceName = record["tags"]["device"]
+ hostName = record["tags"]["hostName"]
+ end
+ timestamp = record["timestamp"]
+ convertedTimestamp = Time.at(timestamp.to_i).utc.iso8601
+ if !usedPercent.nil? && !deviceName.nil? && !hostName.nil?
+ diskUsedPercentageRecord = MdmAlertTemplates::Disk_used_percentage_metrics_template % {
+ timestamp: convertedTimestamp,
+ metricName: Constants::MDM_DISK_USED_PERCENTAGE,
+ hostvalue: hostName,
+ devicevalue: deviceName,
+ diskUsagePercentageValue: usedPercent,
+ }
+ records.push(Yajl::Parser.parse(StringIO.new(diskUsedPercentageRecord)))
+ end
+ rescue => errorStr
+ @log.info "Error in getDiskUsageMetricRecords: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return records
+ end
+
+ def getMetricRecords(record)
+ records = []
+ begin
+ dimNames = String.new "" #mutable string
+ dimValues = String.new ""
+ noDimVal = "-"
+ metricValue = 0
+ if !record["tags"].nil?
+ dimCount = 0
+ record["tags"].each { |k, v|
+ dimCount = dimCount + 1
+ if (dimCount <= 10) #MDM = 10 dims
+ dimNames.concat("\"#{k}\"")
+ dimNames.concat(",")
+ if !v.nil? && v.length > 0
+ dimValues.concat("\"#{v}\"")
+ else
+ dimValues.concat("\"#{noDimVal}\"")
+ end
+ dimValues.concat(",")
+ end
+ }
+ if (dimNames.end_with?(","))
+ dimNames.chomp!(",")
+ end
+ if (dimValues.end_with?(","))
+ dimValues.chomp!(",")
+ end
+ end
+ timestamp = record["timestamp"]
+ convertedTimestamp = Time.at(timestamp.to_i).utc.iso8601
+ if !record["fields"].nil?
+ record["fields"].each { |k, v|
+ if is_numeric(v)
+ metricRecord = MdmAlertTemplates::Generic_metric_template % {
+ timestamp: convertedTimestamp,
+ metricName: k,
+ namespaceSuffix: record["name"],
+ dimNames: dimNames,
+ dimValues: dimValues,
+ metricValue: v,
+ }
+ records.push(Yajl::Parser.parse(StringIO.new(metricRecord)))
+ #@log.info "pushed mdmgenericmetric: #{k},#{v}"
+ end
+ }
+ end
+ rescue => errorStr
+ @log.info "getMetricRecords:Error: #{errorStr} for record #{record}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return records
+ end
+
+ def is_numeric(o)
+ true if Float(o) rescue false
+ end
+
+ def getContainerResourceUtilizationThresholds
+ begin
+ metric_threshold_hash = {}
+        # Initializing with default values
+ metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD
+ metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD
+ metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD
+ metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD
+
+ cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"]
+ if !cpuThreshold.nil? && !cpuThreshold.empty?
+ #Rounding this to 2 decimal places, since this value is user configurable
+ cpuThresholdFloat = (cpuThreshold.to_f).round(2)
+ metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] = cpuThresholdFloat
+ end
+
+ memoryRssThreshold = ENV["AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD"]
+ if !memoryRssThreshold.nil? && !memoryRssThreshold.empty?
+ memoryRssThresholdFloat = (memoryRssThreshold.to_f).round(2)
+ metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = memoryRssThresholdFloat
+ end
+
+ memoryWorkingSetThreshold = ENV["AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD"]
+ if !memoryWorkingSetThreshold.nil? && !memoryWorkingSetThreshold.empty?
+ memoryWorkingSetThresholdFloat = (memoryWorkingSetThreshold.to_f).round(2)
+ metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = memoryWorkingSetThresholdFloat
+ end
+
+ pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"]
+ if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty?
+ pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2)
+ metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat
+ end
+ rescue => errorStr
+ @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return metric_threshold_hash
+ end
+
+ def getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_metric_value)
+ records = []
+ begin
+ custommetricrecord = MdmAlertTemplates::Node_resource_metrics_template % {
+ timestamp: record["DataItems"][0]["Timestamp"],
+ metricName: metric_name,
+ hostvalue: record["DataItems"][0]["Host"],
+ objectnamevalue: record["DataItems"][0]["ObjectName"],
+ instancenamevalue: record["DataItems"][0]["InstanceName"],
+ metricminvalue: metric_value,
+ metricmaxvalue: metric_value,
+ metricsumvalue: metric_value,
+ }
+ records.push(Yajl::Parser.parse(StringIO.new(custommetricrecord)))
+
+ if !percentage_metric_value.nil?
+ additional_record = MdmAlertTemplates::Node_resource_metrics_template % {
+ timestamp: record["DataItems"][0]["Timestamp"],
+ metricName: @@node_metric_name_metric_percentage_name_hash[metric_name],
+ hostvalue: record["DataItems"][0]["Host"],
+ objectnamevalue: record["DataItems"][0]["ObjectName"],
+ instancenamevalue: record["DataItems"][0]["InstanceName"],
+ metricminvalue: percentage_metric_value,
+ metricmaxvalue: percentage_metric_value,
+ metricsumvalue: percentage_metric_value,
+ }
+ records.push(Yajl::Parser.parse(StringIO.new(additional_record)))
+ end
+ rescue => errorStr
+ @log.info "Error in getNodeResourceMetricRecords: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return records
+ end
+
+ def generateOOMKilledContainerMetrics(podControllerName, podNamespace)
+ begin
+ dim_key = [podControllerName, podNamespace].join("~~")
+ @log.info "adding dimension key to oom killed container hash..."
+ @oom_killed_container_count_hash[dim_key] = @oom_killed_container_count_hash.key?(dim_key) ? @oom_killed_container_count_hash[dim_key] + 1 : 1
+ rescue => errorStr
+ @log.warn "Error in generateOOMKilledContainerMetrics: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ end
+
+ def generateRestartingContainersMetrics(podControllerName, podNamespace)
+ begin
+ dim_key = [podControllerName, podNamespace].join("~~")
+ @log.info "adding dimension key to container restart count hash..."
+ @container_restart_count_hash[dim_key] = @container_restart_count_hash.key?(dim_key) ? @container_restart_count_hash[dim_key] + 1 : 1
+ rescue => errorStr
+ @log.warn "Error in generateRestartingContainersMetrics: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ end
+
+ def generatePodReadyMetrics(podControllerName, podNamespace, podReadyCondition)
+ begin
+ dim_key = [podControllerName, podNamespace].join("~~")
+ @log.info "adding dimension key to pod ready hash..."
+ if podReadyCondition == true
+ @pod_ready_hash[dim_key] = @pod_ready_hash.key?(dim_key) ? @pod_ready_hash[dim_key] + 1 : 1
+ else
+ @pod_not_ready_hash[dim_key] = @pod_not_ready_hash.key?(dim_key) ? @pod_not_ready_hash[dim_key] + 1 : 1
+ end
+ rescue => errorStr
+ @log.warn "Error in generatePodReadyMetrics: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ end
+
+ def generateStaleJobCountMetrics(podControllerName, podNamespace)
+ begin
+ dim_key = [podControllerName, podNamespace].join("~~")
+ @log.info "adding dimension key to stale job count hash..."
+ @stale_job_count_hash[dim_key] = @stale_job_count_hash.key?(dim_key) ? @stale_job_count_hash[dim_key] + 1 : 1
+ rescue => errorStr
+ @log.warn "Error in generateStaleJobCountMetrics: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/agent_common.rb b/source/plugins/ruby-fluentd4/agent_common.rb
new file mode 100644
index 000000000..25a34a09c
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/agent_common.rb
@@ -0,0 +1,117 @@
+# This file extends the OMS::Common class and with
+# methods shared by the topology and telemetry scripts.
+# It remains separate in order to retain compatibility between
+# plugins from DSC modules and those in the shell bundle.
+
+class StrongTypedClass
+ def self.strongtyped_accessor(name, type)
+ # setter
+ self.class_eval("def #{name}=(value);
+ if !value.is_a? #{type} and !value.nil?
+ raise ArgumentError, \"Invalid data type. #{name} should be type #{type}\"
+ end
+ @#{name}=value
+ end")
+
+ # getter
+ self.class_eval("def #{name};@#{name};end")
+ end
+
+ def self.strongtyped_arch(name)
+ # setter
+ self.class_eval("def #{name}=(value);
+ if (value != 'x64' && value != 'x86')
+ raise ArgumentError, \"Invalid data for ProcessorArchitecture.\"
+ end
+ @#{name}=value
+ end")
+ end
+end
+
+module OMS
+
+ # Error codes and categories:
+ # User configuration/parameters:
+ INVALID_OPTION_PROVIDED = 2
+ NON_PRIVELEGED_USER_ERROR_CODE = 3
+ # System configuration:
+ MISSING_CONFIG_FILE = 4
+ MISSING_CONFIG = 5
+ MISSING_CERTS = 6
+ # Service/network-related:
+ HTTP_NON_200 = 7
+ ERROR_SENDING_HTTP = 8
+ ERROR_EXTRACTING_ATTRIBUTES = 9
+ MISSING_CERT_UPDATE_ENDPOINT = 10
+ # Internal errors:
+ ERROR_GENERATING_CERTS = 11
+ ERROR_WRITING_TO_FILE = 12
+
+ class Common
+
+ require 'syslog/logger'
+
+ class << self
+
+ # Helper method that returns true if a file exists and is non-empty
+ def file_exists_nonempty(file_path)
+ return (!file_path.nil? and File.exist?(file_path) and !File.zero?(file_path))
+ end
+
+ # Return logger from provided log facility
+ def get_logger(log_facility)
+
+ facility = case log_facility
+ # Custom log facilities supported by both Ruby and bash logger
+ when "auth" then Syslog::LOG_AUTHPRIV # LOG_AUTH is deprecated
+ when "authpriv" then Syslog::LOG_AUTHPRIV
+ when "cron" then Syslog::LOG_CRON
+ when "daemon" then Syslog::LOG_DAEMON
+ when "ftp" then Syslog::LOG_FTP
+ when "kern" then Syslog::LOG_KERN
+          when "lpr" then Syslog::LOG_LPR
+ when "mail" then Syslog::LOG_MAIL
+ when "news" then Syslog::LOG_NEWS
+ when "security" then Syslog::LOG_SECURITY
+ when "syslog" then Syslog::LOG_SYSLOG
+ when "user" then Syslog::LOG_USER
+ when "uucp" then Syslog::LOG_UUCP
+
+ when "local0" then Syslog::LOG_LOCAL0
+ when "local1" then Syslog::LOG_LOCAL1
+ when "local2" then Syslog::LOG_LOCAL2
+ when "local3" then Syslog::LOG_LOCAL3
+ when "local4" then Syslog::LOG_LOCAL4
+ when "local5" then Syslog::LOG_LOCAL5
+ when "local6" then Syslog::LOG_LOCAL6
+ when "local7" then Syslog::LOG_LOCAL7
+
+ # default logger will be local0
+ else Syslog::LOG_LOCAL0
+ end
+
+ if !Syslog.opened?
+ Syslog::Logger.syslog = Syslog.open("omsagent", Syslog::LOG_PID, facility)
+ end
+ return Syslog::Logger.new
+ end
+
+ # Return a POST request with the specified headers, URI, and body, and an
+ # HTTP to execute that request
+ def form_post_request_and_http(headers, uri_string, body, cert, key, proxy)
+ uri = URI.parse(uri_string)
+ req = Net::HTTP::Post.new(uri.request_uri, headers)
+ req.body = body
+
+ http = create_secure_http(uri, OMS::Configuration.get_proxy_config(proxy))
+ http.cert = cert
+ http.key = key
+
+ return req, http
+ end # form_post_request_and_http
+
+ end
+
+ end
+
+end
\ No newline at end of file
diff --git a/source/plugins/ruby-fluentd4/agent_maintenance_script.rb b/source/plugins/ruby-fluentd4/agent_maintenance_script.rb
new file mode 100644
index 000000000..80ec60b68
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/agent_maintenance_script.rb
@@ -0,0 +1,639 @@
+require 'optparse'
+
+module MaintenanceModule
+
+ class Maintenance
+ require 'openssl'
+ require 'fileutils'
+ require 'net/http'
+ require 'uri'
+ require 'gyoku'
+ require 'etc'
+ require 'iso8601'
+
+ require_relative 'oms_common'
+ require_relative 'oms_configuration'
+ require_relative 'agent_topology_request_script'
+ require_relative 'agent_common'
+
+ attr_reader :AGENT_USER, :load_config_return_code
+ attr_accessor :suppress_stdout
+
+ def initialize(omsadmin_conf_path, cert_path, key_path, pid_path, proxy_path,
+ os_info, install_info, log = nil, verbose = false)
+ @suppress_logging = true # suppress_logging suppresses all output, including both print and logger
+ @suppress_stdout = false # suppress_stdout suppresses only print
+
+ @AGENT_USER = "omsagent"
+ @AGENT_GROUP = "omiusers"
+ @omsadmin_conf_path = omsadmin_conf_path
+ @cert_path = cert_path
+ @key_path = key_path
+ @pid_path = pid_path
+ @proxy_path = proxy_path
+ @os_info = os_info
+ @install_info = install_info
+ @verbose = verbose
+ # Config to be read/written from omsadmin.conf
+ @WORKSPACE_ID = nil
+ @AGENT_GUID = nil
+ @URL_TLD = nil
+ @LOG_FACILITY = nil
+ @CERTIFICATE_UPDATE_ENDPOINT = nil
+
+ @load_config_return_code = load_config
+ @logger = log.nil? ? OMS::Common.get_logger(@LOG_FACILITY) : log
+
+ @suppress_logging = false
+ end
+
+ # Return true if the current executing user is root
+ def is_current_user_root
+ return true if (Process.euid == 0)
+ end
+
+ # Return true if the user should be running this script (root or omsagent or testing)
+ def check_user
+ if (!ENV["TEST_WORKSPACE_ID"].nil? or !ENV["TEST_SHARED_KEY"].nil?) or
+ (is_current_user_root or Etc.getpwuid(Process.euid).name == @AGENT_USER)
+ return true
+ else
+ log_error("This script must be run as root or as the #{@AGENT_USER} user.")
+ return false
+ end
+ end
+
+ # Return variable derived from install_info.txt (like "LinuxMonitoringAgent/1.2.0-148")
+ def get_user_agent
+ user_agent = "LinuxMonitoringAgent/"
+ if OMS::Common.file_exists_nonempty(@install_info)
+ user_agent.concat(File.readlines(@install_info)[0].split.first)
+ end
+ return user_agent
+ end
+
+ # Ensure files generated by this script are owned by omsagent
+ def chown_omsagent(file_list)
+ if is_current_user_root
+ FileUtils.chown(@AGENT_USER, @AGENT_GROUP, file_list)
+ end
+ end
+
+ # Logging methods
+ def log_info(message)
+ print("info\t#{message}\n") if !@suppress_logging and !@suppress_stdout
+ @logger.info(message) if !@suppress_logging
+ end
+
+ def log_error(message)
+ print("error\t#{message}\n") if !@suppress_logging and !@suppress_stdout
+ @logger.error(message) if !@suppress_logging
+ end
+
+ def log_debug(message)
+ print("debug\t#{message}\n") if !@suppress_logging and !@suppress_stdout
+ @logger.debug(message) if !@suppress_logging
+ end
+
+ # Load necessary configuration values from omsadmin.conf
+ def load_config
+ if !File.exist?(@omsadmin_conf_path)
+ log_error("Missing configuration file: #{@omsadmin_conf_path}")
+ return OMS::MISSING_CONFIG_FILE
+ end
+
+ File.open(@omsadmin_conf_path, "r").each_line do |line|
+ if line =~ /^WORKSPACE_ID/
+ @WORKSPACE_ID = line.sub("WORKSPACE_ID=","").strip
+ elsif line =~ /^AGENT_GUID/
+ @AGENT_GUID = line.sub("AGENT_GUID=","").strip
+ elsif line =~ /^URL_TLD/
+ @URL_TLD = line.sub("URL_TLD=","").strip
+ elsif line =~ /^LOG_FACILITY/
+ @LOG_FACILITY = line.sub("LOG_FACILITY=","").strip
+ elsif line =~ /^CERTIFICATE_UPDATE_ENDPOINT/
+ @CERTIFICATE_UPDATE_ENDPOINT = line.sub("CERTIFICATE_UPDATE_ENDPOINT=","").strip
+ end
+ end
+
+ return 0
+ end
+
+ # Update omsadmin.conf with the specified variable's value
+ def update_config(var, val)
+ if !File.exist?(@omsadmin_conf_path)
+ return OMS::MISSING_CONFIG_FILE
+ end
+
+ old_text = File.read(@omsadmin_conf_path)
+ new_text = old_text.sub(/^#{var}=.*\n/,"#{var}=#{val}\n")
+
+ File.open(@omsadmin_conf_path, "w") { |file|
+ file.puts(new_text)
+ }
+ end
+
+ # Updates the CERTIFICATE_UPDATE_ENDPOINT variable and renews certificate if requested
+ def apply_certificate_update_endpoint(server_resp, check_for_renew_request = true)
+ update_attr = ""
+ cert_update_endpoint = ""
+
+ # Extract the certificate update endpoint from the server response
+      endpoint_tag_regex = /<CertificateUpdateEndpoint.*updateCertificate=\"(?<update_cert>(true|false))\".*(?<cert_update_endpoint>https.*RenewCertificate).*<\/CertificateUpdateEndpoint>/
+ endpoint_tag_regex.match(server_resp) { |match|
+ cert_update_endpoint = match["cert_update_endpoint"]
+ update_attr = match["update_cert"]
+ }
+
+ if cert_update_endpoint.empty?
+ log_error("Could not extract the update certificate endpoint.")
+ return OMS::MISSING_CERT_UPDATE_ENDPOINT
+ elsif update_attr.empty?
+ log_error("Could not find the updateCertificate tag in OMS Agent management service telemetry response")
+ return OMS::ERROR_EXTRACTING_ATTRIBUTES
+ end
+
+ # Update omsadmin.conf with cert_update_endpoint variable
+ @CERTIFICATE_UPDATE_ENDPOINT = cert_update_endpoint
+      # When apply_certificate_update_endpoint is called from onboarding, the endpoint will be returned in a file
+ update_config("CERTIFICATE_UPDATE_ENDPOINT", cert_update_endpoint)
+
+ # Check in the response if the certs should be renewed
+ if update_attr == "true" and check_for_renew_request
+ renew_certs_ret = renew_certs
+ if renew_certs_ret != 0
+ return renew_certs_ret
+ end
+ end
+
+ return cert_update_endpoint
+ end
+
+ # Update the DSC_ENDPOINT variable in omsadmin.conf from the server XML
+ def apply_dsc_endpoint(server_resp)
+ dsc_endpoint = ""
+
+ # Extract the DSC endpoint from the server response
+      dsc_conf_regex = /<DscConfiguration.*<Endpoint>(?<endpoint>.*)<\/Endpoint>.*<\/DscConfiguration>/
+ dsc_conf_regex.match(server_resp) { |match|
+ dsc_endpoint = match["endpoint"]
+ # Insert escape characters before open and closed parentheses
+ dsc_endpoint = dsc_endpoint.gsub("(", "\\\\(").gsub(")", "\\\\)")
+ }
+
+ if dsc_endpoint.empty?
+ log_error("Could not extract the DSC endpoint.")
+ return OMS::ERROR_EXTRACTING_ATTRIBUTES
+ end
+
+ # Update omsadmin.conf with dsc_endpoint variable
+ # When apply_dsc_endpoint is called from onboarding, dsc_endpoint will be returned in file
+ update_config("DSC_ENDPOINT", dsc_endpoint)
+
+ return dsc_endpoint
+ end
+
+ # Pass the server response from an XML file to apply_dsc_endpoint and apply_certificate_update_endpoint
+ # Save DSC_ENDPOINT and CERTIFICATE_UPDATE_ENDPOINT variables in file to be read outside of this script
+ def apply_endpoints_file(xml_file, output_file)
+ if !OMS::Common.file_exists_nonempty(xml_file)
+ return OMS::MISSING_CONFIG_FILE
+ end
+
+ server_resp = File.read(xml_file)
+ cert_update_applied = apply_certificate_update_endpoint(server_resp, check_for_renew_request = false)
+ dsc_applied = apply_dsc_endpoint(server_resp)
+
+ if cert_update_applied.class != String
+ return cert_update_applied
+ elsif dsc_applied.class != String
+ return dsc_applied
+ else
+ output_handle = nil
+ begin
+ # To return endpoint strings to onboarding script, save to file
+ output_handle = File.new(output_file, "w")
+ chown_omsagent(output_file)
+ output_handle.write("#{cert_update_applied}\n"\
+ "#{dsc_applied}\n")
+ rescue => e
+ log_error("Error saving endpoints to file: #{e.message}")
+ return OMS::ERROR_WRITING_TO_FILE
+ ensure
+ if !output_handle.nil?
+ output_handle.close
+ end
+ end
+ end
+
+ return 0
+ end
+
+ # Return the certificate text as a single formatted string
+ def get_cert_server(cert_path)
+ cert_server = ""
+
+ cert_file_contents = File.readlines(cert_path)
+ for i in 1..(cert_file_contents.length-2) # skip first and last line in file
+ line = cert_file_contents[i]
+ cert_server.concat(line[0..-2])
+ if i < (cert_file_contents.length-2)
+ cert_server.concat(" ")
+ end
+ end
+
+ return cert_server
+ end
+
+ # Update the topology and telemetry request frequencies
+ def apply_request_intervals(server_resp)
+ return "" if !defined?(OMS::Configuration.set_request_intervals)
+
+ topology_interval = ""
+ telemetry_interval = ""
+
+      topology_interval_regex = /queryInterval=\"(?<topologyInterval>.*?)\"/
+ topology_interval_regex.match(server_resp) { |match|
+ topology_interval = match["topologyInterval"]
+ }
+
+ telemetry_interval_regex = /telemetryReportInterval=\"(?.*?)\"/
+ telemetry_interval_regex.match(server_resp) { |match|
+ telemetry_interval = match["telemetryInterval"]
+ }
+
+ if topology_interval.empty?
+ log_error("Topology request interval not found in homing service response.")
+ return OMS::ERROR_EXTRACTING_ATTRIBUTES
+ end
+
+ if telemetry_interval.empty?
+ log_error("Telemetry request interval not found in homing service response.")
+ return OMS::ERROR_EXTRACTING_ATTRIBUTES
+ end
+
+ begin
+ topology_interval = ISO8601::Duration.new(topology_interval).to_seconds
+ telemetry_interval = ISO8601::Duration.new(telemetry_interval).to_seconds
+ rescue => e
+ OMS::Log.error_once("Error parsing request intervals. #{e}")
+ end
+
+ OMS::Configuration.set_request_intervals(topology_interval, telemetry_interval)
+
+ return ""
+ end # apply_request_intervals
+
# Perform a topology request (heartbeat) against the OMS endpoint.
# Reloads configuration, builds the AgentTopologyRequest XML (best-effort
# telemetry attached), POSTs it authenticated with the agent's client
# certificate, and applies any endpoints/intervals the service returns.
# Returns 0 on success or an OMS::* error code.
def heartbeat
  # Reload config in case of updates since last topology request
  @load_config_return_code = load_config
  if @load_config_return_code != 0
    log_error("Error loading configuration from #{@omsadmin_conf_path}")
    return @load_config_return_code
  end

  # Check necessary inputs
  if @WORKSPACE_ID.nil? or @AGENT_GUID.nil? or @URL_TLD.nil? or
      @WORKSPACE_ID.empty? or @AGENT_GUID.empty? or @URL_TLD.empty?
    log_error("Missing required field from configuration file: #{@omsadmin_conf_path}")
    return OMS::MISSING_CONFIG
  elsif !OMS::Common.file_exists_nonempty(@cert_path) or !OMS::Common.file_exists_nonempty(@key_path)
    log_error("Certificates for topology request do not exist")
    return OMS::MISSING_CERTS
  end

  # Generate the request body. Telemetry failures are non-fatal: the error
  # is logged and the request proceeds (body_hb_xml may be nil on failure).
  begin
    body_hb_xml = AgentTopologyRequestHandler.new.handle_request(@os_info, @omsadmin_conf_path,
        @AGENT_GUID, get_cert_server(@cert_path), @pid_path, telemetry=true)
    if !xml_contains_telemetry(body_hb_xml)
      log_debug("No Telemetry data was appended to OMS agent management service topology request")
    end
  rescue => e
    log_error("Error when appending Telemetry to OMS agent management service topology request: #{e.message}")
  end

  # Form headers
  headers = {}
  req_date = Time.now.utc.strftime("%Y-%m-%dT%T.%N%:z")
  headers[OMS::CaseSensitiveString.new("x-ms-Date")] = req_date
  headers["User-Agent"] = get_user_agent
  headers[OMS::CaseSensitiveString.new("Accept-Language")] = "en-US"

  # Form POST request and HTTP, mutually authenticated with the agent cert
  req,http = OMS::Common.form_post_request_and_http(headers, "https://#{@WORKSPACE_ID}.oms.#{@URL_TLD}/"\
      "AgentService.svc/LinuxAgentTopologyRequest", body_hb_xml,
      OpenSSL::X509::Certificate.new(File.open(@cert_path)),
      OpenSSL::PKey::RSA.new(File.open(@key_path)), @proxy_path)

  log_info("Generated topology request:\n#{req.body}") if @verbose

  # Submit request; a transport error leaves res nil and is reported below
  begin
    res = nil
    res = http.start { |http_each| http.request(req) }
  rescue => e
    log_error("Error sending the topology request to OMS agent management service: #{e.message}")
  end

  if !res.nil?
    log_info("OMS agent management service topology request response code: #{res.code}") if @verbose

    if res.code == "200"
      # Apply any updated endpoints/intervals from the response body.
      # Each apply_* helper returns a String on success, error code otherwise.
      cert_apply_res = apply_certificate_update_endpoint(res.body)
      dsc_apply_res = apply_dsc_endpoint(res.body)
      frequency_apply_res = apply_request_intervals(res.body)
      if cert_apply_res.class != String
        return cert_apply_res
      elsif dsc_apply_res.class != String
        return dsc_apply_res
      elsif frequency_apply_res.class != String
        return frequency_apply_res
      else
        log_info("OMS agent management service topology request success")
        return 0
      end
    else
      log_error("Error sending OMS agent management service topology request . HTTP code #{res.code}")
      return OMS::HTTP_NON_200
    end
  else
    log_error("Error sending OMS agent management service topology request . No HTTP code")
    return OMS::ERROR_SENDING_HTTP
  end
end
+
# Create the self-signed public/private key pair for the agent/workspace.
# Writes an RSA-2048 private key to @key_path and a 365-day self-signed
# X.509v3 certificate (CN=workspace_id, CN=agent_guid) to @cert_path.
# Returns 0 on success or an OMS::* error code.
def generate_certs(workspace_id, agent_guid)
  if workspace_id.nil? or agent_guid.nil? or workspace_id.empty? or agent_guid.empty?
    log_error("Both WORKSPACE_ID and AGENT_GUID must be defined to generate certificates")
    return OMS::MISSING_CONFIG
  end

  log_info("Generating certificate ...")
  error=nil

  # Set safe certificate permissions before to prevent timing attacks:
  # files are created and restricted to 0640/omsagent before any key
  # material is written to them.
  key_file = File.new(@key_path, "w")
  cert_file = File.new(@cert_path, "w")
  File.chmod(0640, @key_path)
  File.chmod(0640, @cert_path)
  chown_omsagent([@key_path, @cert_path])

  begin
    # Create new private key of 2048 bits
    key = OpenSSL::PKey::RSA.new(2048)

    x509_version = 2 # enable X509 V3 extensions
    two_byte_range = 2**16 - 2 # serial number drawn from a two-byte range
    year = 1 * 365 * 24 * 60 * 60 # 365 days validity for certificate

    # Generate CSR from new private key
    csr = OpenSSL::X509::Request.new
    csr.version = x509_version
    csr.subject = OpenSSL::X509::Name.new([
        ["CN", workspace_id],
        ["CN", agent_guid],
        ["OU", "Linux Monitoring Agent"],
        ["O", "Microsoft"]])
    csr.public_key = key.public_key
    csr.sign(key, OpenSSL::Digest::SHA256.new)

    # Self-sign CSR
    csr_cert = OpenSSL::X509::Certificate.new
    csr_cert.serial = SecureRandom.random_number(two_byte_range) + 1 # serial in [1, 2^16 - 1]
    csr_cert.version = x509_version
    csr_cert.not_before = Time.now
    csr_cert.not_after = Time.now + year
    csr_cert.subject = csr.subject
    csr_cert.public_key = csr.public_key
    csr_cert.issuer = csr_cert.subject # self-signed
    ef = OpenSSL::X509::ExtensionFactory.new
    ef.subject_certificate = csr_cert
    ef.issuer_certificate = csr_cert
    csr_cert.add_extension(ef.create_extension("subjectKeyIdentifier","hash",false))
    csr_cert.add_extension(ef.create_extension("authorityKeyIdentifier","keyid:always",false))
    csr_cert.add_extension(ef.create_extension("basicConstraints","CA:TRUE",false))
    csr_cert.sign(key, OpenSSL::Digest::SHA256.new)

    # Write key and cert to files
    key_file.write(key)
    cert_file.write(csr_cert)
  rescue => e
    # Defer error reporting until the handles are closed below
    error = e
  ensure
    key_file.close
    cert_file.close
  end

  # Check for any error or non-existent or empty files
  if !error.nil?
    log_error("Error generating certs: #{error.message}")
    return OMS::ERROR_GENERATING_CERTS
  elsif !OMS::Common.file_exists_nonempty(@cert_path) or !OMS::Common.file_exists_nonempty(@key_path)
    log_error("Error generating certs")
    return OMS::ERROR_GENERATING_CERTS
  end

  return 0
end
+
# Simple class to support interaction with topology script helper method
# (obj_to_hash): its single accessor is serialized into the
# CertificateUpdateRequest XML body by renew_certs.
class AgentRenewCertificateRequest < StrongTypedClass
  strongtyped_accessor :NewCertificate, String
end
+
# Roll back to a previously saved certificate/key pair by overwriting the
# on-disk cert and key files with the provided contents.
def restore_old_certs(cert_old, key_old)
  File.open(@cert_path, "w") { |f| f.write(cert_old) }
  File.open(@key_path, "w") { |f| f.write(key_old) }
end
+
# Renew certificates for the agent/workspace connection: generate a new
# self-signed pair, POST the new public cert to CERTIFICATE_UPDATE_ENDPOINT
# (request signed with the OLD pair), then confirm with one heartbeat.
# On any failure after generation, the old cert/key are restored to disk.
# Returns 0 on success or an OMS::* error code.
def renew_certs
  # Check necessary inputs
  if @load_config_return_code != 0
    log_error("Error loading configuration from #{@omsadmin_conf_path}")
    return @load_config_return_code
  elsif @WORKSPACE_ID.nil? or @AGENT_GUID.nil? or @WORKSPACE_ID.empty? or @AGENT_GUID.empty?
    log_error("Missing required field from configuration file: #{@omsadmin_conf_path}")
    return OMS::MISSING_CONFIG
  elsif @CERTIFICATE_UPDATE_ENDPOINT.nil? or @CERTIFICATE_UPDATE_ENDPOINT.empty?
    log_error("Missing CERTIFICATE_UPDATE_ENDPOINT from configuration")
    return OMS::MISSING_CONFIG
  elsif !OMS::Common.file_exists_nonempty(@cert_path) or !OMS::Common.file_exists_nonempty(@key_path)
    log_error("No certificates exist; cannot renew certificates")
    return OMS::MISSING_CERTS
  end

  log_info("Renewing the certificates")

  # Save old certs in memory — used both to sign the renewal request and to
  # roll back via restore_old_certs on failure
  cert_old = OpenSSL::X509::Certificate.new(File.open(@cert_path))
  key_old = OpenSSL::PKey::RSA.new(File.open(@key_path))

  generated = generate_certs(@WORKSPACE_ID, @AGENT_GUID)
  if generated != 0
    return generated
  end

  # Form POST request
  renew_certs_req = AgentRenewCertificateRequest.new
  renew_certs_req.NewCertificate = get_cert_server(@cert_path)

  # NOTE(review): this literal looks like it once carried an XML declaration
  # (e.g. "<?xml ...?>") that was stripped from this copy of the file —
  # verify against the shipped script before relying on it.
  renew_certs_xml = "\n"
  renew_certs_xml.concat(Gyoku.xml({ "CertificateUpdateRequest" => {:content! => obj_to_hash(renew_certs_req), \
:'@xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance", :'@xmlns:xsd' => "http://www.w3.org/2001/XMLSchema", \
:@xmlns => "http://schemas.microsoft.com/WorkloadMonitoring/HealthServiceProtocol/2014/09/"}}))

  req,http = OMS::Common.form_post_request_and_http(headers = {}, @CERTIFICATE_UPDATE_ENDPOINT,
             renew_certs_xml, cert_old, key_old, @proxy_path)

  log_info("Generated renew certificates request:\n#{req.body}") if @verbose

  # Submit request; roll back the on-disk certs on transport failure
  begin
    res = nil
    res = http.start { |http_each| http.request(req) }
  rescue => e
    log_error("Error renewing certificate: #{e.message}")
    restore_old_certs(cert_old, key_old)
    return OMS::ERROR_SENDING_HTTP
  end

  if !res.nil?
    log_info("Renew certificates response code: #{res.code}") if @verbose

    if res.code == "200"
      # Do one heartbeat for the server to acknowledge the change
      hb_return = heartbeat

      if hb_return == 0
        log_info("Certificates successfully renewed")
      else
        log_error("Error renewing certificate. Restoring old certs.")
        restore_old_certs(cert_old, key_old)
        return hb_return
      end
    else
      log_error("Error renewing certificate. HTTP code #{res.code}")
      restore_old_certs(cert_old, key_old)
      return OMS::HTTP_NON_200
    end
  else
    log_error("Error renewing certificate. No HTTP code")
    return OMS::ERROR_SENDING_HTTP
  end

  return 0
end
+
+ end # class Maintenance
+end # module MaintenanceModule
+
+
# Print command-line usage for this maintenance script.
# NOTE(review): the placeholder list of positional arguments appears to have
# been stripped from this copy (necessary_inputs is a bare space) — verify
# against the shipped script.
def usage
  script = File.basename($0)
  args = " "
  message = "\nMaintenance tool for OMS Agent onboarded to workspace:"\
            "\nHeartbeat:\n"\
            "ruby #{script} -h #{args}\n"\
            "ruby #{script} --heartbeat #{args}\n"\
            "\nRenew certificates:\n"\
            "ruby #{script} -r #{args}\n"\
            "ruby #{script} --renew-certs #{args}\n"\
            "\nOptional: Add -v for verbose output\n"
  print(message)
end
+
# CLI entry point. Positional ARGV after option parsing:
# [0]=omsadmin.conf [1]=cert [2]=key [3]=pid file [4]=proxy conf
# [5]=os info file [6]=install info file.
# The process exit code is the OMS return code of the selected operation.
if __FILE__ == $0
  options = {}
  OptionParser.new do |opts|
    opts.on("-h", "--heartbeat") do |h|
      options[:heartbeat] = h
    end
    opts.on("-c", "--generate-certs") do |c|
      options[:generate_certs] = c
    end
    opts.on("-r", "--renew-certs") do |r|
      options[:renew_certs] = r
    end
    opts.on("-w WORKSPACE_ID") do |w|
      options[:workspace_id] = w
    end
    opts.on("-a AGENT_GUID") do |a|
      options[:agent_guid] = a
    end
    opts.on("--endpoints XML,ENDPOINT_FILE", Array) do |e|
      options[:apply_endpoints] = e
    end
    opts.on("-v", "--verbose") do |v|
      options[:verbose] = true
    end
    # Note: this option only suppresses verbose output
    opts.on("-s", "--suppress-verbose") do |s|
      options[:verbose] = false
    end
  end.parse!

  # All seven positional paths are required
  if (ARGV.length < 7)
    usage
    exit 0
  end

  omsadmin_conf_path = ARGV[0]
  cert_path = ARGV[1]
  key_path = ARGV[2]
  pid_path = ARGV[3]
  proxy_path = ARGV[4]
  os_info = ARGV[5]
  install_info = ARGV[6]

  maintenance = MaintenanceModule::Maintenance.new(omsadmin_conf_path, cert_path, key_path,
                pid_path, proxy_path, os_info, install_info, log = nil, options[:verbose])
  ret_code = 0

  # Refuse to run for unprivileged users
  if !maintenance.check_user
    ret_code = OMS::NON_PRIVELEGED_USER_ERROR_CODE

  elsif options[:heartbeat]
    ret_code = maintenance.heartbeat

  elsif options[:generate_certs]
    if ENV["TEST_WORKSPACE_ID"].nil? and ENV["TEST_SHARED_KEY"].nil? and !maintenance.is_current_user_root
      usage # generate_certs only intended for onboarding script and testing
      ret_code = OMS::INVALID_OPTION_PROVIDED
    elsif options[:workspace_id].nil? or options[:agent_guid].nil?
      print("To generate certificates, you must include both -w WORKSPACE_ID and -a AGENT_GUID")
      ret_code = OMS::INVALID_OPTION_PROVIDED
    else
      ret_code = maintenance.generate_certs(options[:workspace_id], options[:agent_guid])
    end

  elsif options[:renew_certs]
    ret_code = maintenance.renew_certs

  elsif options[:apply_endpoints]
    if ENV["TEST_WORKSPACE_ID"].nil? and ENV["TEST_SHARED_KEY"].nil? and !maintenance.is_current_user_root
      usage # apply_endpoints only intended for onboarding script and testing
      ret_code = OMS::INVALID_OPTION_PROVIDED
    elsif options[:apply_endpoints].length != 2
      print("To apply the endpoints, you must include both input XML and output file: "\
            "--endpoints XML,ENDPOINT_FILE\n")
      ret_code = OMS::INVALID_OPTION_PROVIDED
    else
      ret_code = maintenance.apply_endpoints_file(options[:apply_endpoints][0], options[:apply_endpoints][1])
    end

  else
    usage
  end

  exit ret_code
end
\ No newline at end of file
diff --git a/source/plugins/ruby-fluentd4/agent_telemetry_script.rb b/source/plugins/ruby-fluentd4/agent_telemetry_script.rb
new file mode 100644
index 000000000..d3e56e15c
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/agent_telemetry_script.rb
@@ -0,0 +1,412 @@
+require 'optparse'
+
+module OMS
+
+ require_relative 'agent_common'
+
+ # Operation Types
+ SEND_BATCH = "SendBatch"
+
# Max/average memory and CPU figures for the omsagent (OMS*) and omiagent
# (OMI*) processes over one telemetry window, as filled in by
# Telemetry#calculate_resource_usage.
class AgentResourceUsage < StrongTypedClass
  strongtyped_accessor :OMSMaxMemory, Integer
  strongtyped_accessor :OMSMaxPercentMemory, Integer
  strongtyped_accessor :OMSMaxUserTime, Integer
  strongtyped_accessor :OMSMaxSystemTime, Integer
  strongtyped_accessor :OMSAvgMemory, Integer
  strongtyped_accessor :OMSAvgPercentMemory, Integer
  strongtyped_accessor :OMSAvgUserTime, Integer
  strongtyped_accessor :OMSAvgSystemTime, Integer
  strongtyped_accessor :OMIMaxMemory, Integer
  strongtyped_accessor :OMIMaxPercentMemory, Integer
  strongtyped_accessor :OMIMaxUserTime, Integer
  strongtyped_accessor :OMIMaxSystemTime, Integer
  strongtyped_accessor :OMIAvgMemory, Integer
  strongtyped_accessor :OMIAvgPercentMemory, Integer
  strongtyped_accessor :OMIAvgUserTime, Integer
  strongtyped_accessor :OMIAvgSystemTime, Integer
end
+
# One merged upload-quality record per source: batch counts, per-event
# sizes, and local/network latency statistics, as assembled by
# Telemetry#calculate_qos.
class AgentQoS < StrongTypedClass
  strongtyped_accessor :Operation, String
  strongtyped_accessor :OperationSuccess, String
  strongtyped_accessor :Message, String
  strongtyped_accessor :Source, String
  strongtyped_accessor :BatchCount, Integer
  strongtyped_accessor :MinBatchEventCount, Integer
  strongtyped_accessor :MaxBatchEventCount, Integer
  strongtyped_accessor :AvgBatchEventCount, Integer
  strongtyped_accessor :MinEventSize, Integer
  strongtyped_accessor :MaxEventSize, Integer
  strongtyped_accessor :AvgEventSize, Integer
  strongtyped_accessor :MinLocalLatencyInMs, Integer
  strongtyped_accessor :MaxLocalLatencyInMs, Integer
  strongtyped_accessor :AvgLocalLatencyInMs, Integer
  strongtyped_accessor :NetworkLatencyInMs, Integer
end
+
# Top-level telemetry payload sent to the OMS agent management service.
class AgentTelemetry < StrongTypedClass
  strongtyped_accessor :OSType, String
  strongtyped_accessor :OSDistro, String
  strongtyped_accessor :OSVersion, String
  strongtyped_accessor :Region, String
  strongtyped_accessor :ConfigMgrEnabled, String
  strongtyped_accessor :AgentResourceUsage, AgentResourceUsage
  strongtyped_accessor :AgentQoS, Array # of AgentQoS

  # Serialize this object to a JSON string.
  # Uses a non-destructive map: the original `map!` permanently replaced the
  # AgentQoS objects in self.AgentQoS with plain Hashes, so any later read
  # (or a second serialize call) saw corrupted data. The produced JSON is
  # identical either way, because hash["AgentQoS"] is overwritten below.
  def serialize
    qos_hash = self.AgentQoS.map { |qos| obj_to_hash(qos) }
    hash = obj_to_hash(self)
    hash["AgentQoS"] = qos_hash
    return hash.to_json
  end
end
+
+ class Telemetry
+
+ require 'json'
+
+ require_relative 'omslog'
+ require_relative 'oms_common'
+ require_relative 'oms_configuration'
+ require_relative 'agent_maintenance_script'
+
+ attr_reader :ru_points
+ attr_accessor :suppress_stdout
+
+ QOS_EVENTS_LIMIT = 1000
+
+ @@qos_events = {}
+
# omsadmin_conf_path: omsadmin.conf location; cert_path/key_path: agent
# client certificate pair; pid_path: omsagent pid file; proxy_path: proxy
# config; os_info: OS info file; install_info: install metadata file.
# log defaults to a logger built from the loaded configuration's facility.
def initialize(omsadmin_conf_path, cert_path, key_path, pid_path, proxy_path, os_info, install_info, log = nil, verbose = false)
  # Monitored process ids (0 = unknown / not yet discovered)
  @pids = {oms: 0, omi: 0}
  # Rolling resource-usage samples per process; appended to by
  # poll_resource_usage and drained by calculate_resource_usage
  @ru_points = {oms: {usr_cpu: [], sys_cpu: [], amt_mem: [], pct_mem: []},
                omi: {usr_cpu: [], sys_cpu: [], amt_mem: [], pct_mem: []}}

  @omsadmin_conf_path = omsadmin_conf_path
  @cert_path = cert_path
  @key_path = key_path
  @pid_path = pid_path
  @proxy_path = proxy_path
  @os_info = os_info
  @install_info = install_info

  # Non-zero return code is surfaced by callers that require configuration
  @load_config_return_code = OMS::Configuration.load_configuration(@omsadmin_conf_path, @cert_path, @key_path)

  @workspace_id = OMS::Configuration.workspace_id
  @agent_guid = OMS::Configuration.agent_guid
  @url_tld = OMS::Configuration.url_tld
  @log_facility = OMS::Configuration.log_facility

  @log = log.nil? ? OMS::Common.get_logger(@log_facility) : log
  @verbose = verbose

  @suppress_stdout = false
  @suppress_logging = false
end # initialize
+
# Logging helpers: each mirrors the message to stdout (level-tagged) and to
# @log. @suppress_logging disables both sinks; @suppress_stdout disables
# only the console copy.
def log_info(message)
  print("info\t#{message}\n") unless @suppress_logging || @suppress_stdout
  @log.info(message) unless @suppress_logging
end

def log_error(message)
  print("error\t#{message}\n") unless @suppress_logging || @suppress_stdout
  @log.error(message) unless @suppress_logging
end

def log_debug(message)
  print("debug\t#{message}\n") unless @suppress_logging || @suppress_stdout
  @log.debug(message) unless @suppress_logging
end
+
# Record one upload QoS event for `source`. Must be a class method in order
# to be exposed to all out_*.rb plugins pushing qos events (they share
# @@qos_events).
# batch: either an OMS record Hash carrying 'DataItems', or a raw Array of
# records; anything else is ignored. count: records in the batch.
# time: network latency in seconds.
def self.push_qos_event(operation, operation_success, message, source, batch, count, time)
  begin
    event = { op: operation, op_success: operation_success, m: message, c: count, t: time }
    if batch.is_a? Hash and batch.has_key?('DataItems')
      records = batch['DataItems']
      # Guard: with an empty batch the original records[0] access raised
      # NoMethodError (swallowed below as a spurious error log); there is
      # nothing to measure, so just drop the event.
      return if records.empty?
      if records[0].has_key?('Timestamp')
        now = Time.now
        times = records.map { |record| now - Time.parse(record['Timestamp']) }
        event[:min_l] = times[-1] # Records appear in order, so last record will have lowest latency
        event[:max_l] = times[0]
        event[:sum_l] = times.sum
      end
      sizes = records.map { |record| OMS::Common.parse_json_record_encoding(record) }.compact.map(&:bytesize) # Remove possible nil entries with compact
    elsif batch.is_a? Array
      # These other logs, such as custom logs, don't have a parsed timestamp
      sizes = batch.map { |record| OMS::Common.parse_json_record_encoding(record) }.compact.map(&:bytesize)
    else
      return
    end

    event[:min_s] = sizes.min
    event[:max_s] = sizes.max
    event[:sum_s] = sizes.sum

    # Cap per-source history so a long service outage cannot grow memory
    # without bound.
    if @@qos_events.has_key?(source)
      if @@qos_events[source].size >= QOS_EVENTS_LIMIT
        @@qos_events[source].shift # remove oldest qos event to cap memory use
      end
      @@qos_events[source] << event
    else
      @@qos_events[source] = [event]
    end
  rescue => e
    OMS::Log.error_once("Error pushing QoS event. #{e}")
  end
end # push_qos_event
+
# Integer average of a numeric array; returns 0 for an empty array.
def self.array_avg(array)
  return 0 if array.empty?
  (array.sum / array.size.to_f).to_i
end # array_avg
+
# Refresh @pids: the omsagent pid is read from its pid file (previous value
# kept when the file is missing/unreadable); the omiagent pid comes from
# pgrep (empty output .to_i yields 0 when the process is absent).
def get_pids()
  @pids.each do |key, value|
    case key
    when :oms
      begin
        if File.exist?(@pid_path) and File.readable?(@pid_path)
          @pids[key] = File.read(@pid_path).to_i
        end
      rescue => e
        log_error("Error reading omsagent pid file. #{e}")
      end
    when :omi
      @pids[key] = `pgrep -U omsagent omiagent`.to_i
    end
  end
end
+
# Sample CPU and memory usage for the tracked processes via the OMI CLI and
# append the readings to @ru_points. A process with pid 0 is padded with
# zeros so window averages stay comparable. Skipped entirely under the
# TEST_* environment variables or when omsadmin.conf is absent.
def poll_resource_usage()
  get_pids
  # %s is substituted with the pid below; trailing grep keeps only the
  # "key=value" lines of omicli's output
  command = "/opt/omi/bin/omicli wql root/scx \"SELECT PercentUserTime, PercentPrivilegedTime, UsedMemory, "\
            "PercentUsedMemory FROM SCX_UnixProcessStatisticalInformation where Handle like '%s'\" | grep ="
  begin
    if ENV['TEST_WORKSPACE_ID'].nil? && ENV['TEST_SHARED_KEY'].nil? && File.exist?(@omsadmin_conf_path)
      @pids.each do |key, value|
        if !value.zero?
          `#{command % value}`.each_line do |line|
            @ru_points[key][:usr_cpu] << line.sub("PercentUserTime=","").strip.to_i if line =~ /PercentUserTime/
            @ru_points[key][:sys_cpu] << line.sub("PercentPrivilegedTime=", "").strip.to_i if line =~ /PercentPrivilegedTime/
            @ru_points[key][:amt_mem] << line.sub("UsedMemory=", "").strip.to_i if line =~ / UsedMemory/
            @ru_points[key][:pct_mem] << line.sub("PercentUsedMemory=", "").strip.to_i if line =~ /PercentUsedMemory/
          end
        else # pad with zeros when OMI might not be running
          @ru_points[key][:usr_cpu] << 0
          @ru_points[key][:sys_cpu] << 0
          @ru_points[key][:amt_mem] << 0
          @ru_points[key][:pct_mem] << 0
        end
      end
    end
  rescue => e
    log_error("Error polling resource usage. #{e}")
  end
end # poll_resource_usage
+
# Aggregate the buffered resource-usage samples into an AgentResourceUsage
# record (max and integer-average of each metric, for both omsagent and
# omiagent), then reset the sample buffers. Returns nil on error.
def calculate_resource_usage()
  begin
    oms = @ru_points[:oms]
    omi = @ru_points[:omi]
    usage = AgentResourceUsage.new
    usage.OMSMaxMemory        = oms[:amt_mem].max
    usage.OMSMaxPercentMemory = oms[:pct_mem].max
    usage.OMSMaxUserTime      = oms[:usr_cpu].max
    usage.OMSMaxSystemTime    = oms[:sys_cpu].max
    usage.OMSAvgMemory        = Telemetry.array_avg(oms[:amt_mem])
    usage.OMSAvgPercentMemory = Telemetry.array_avg(oms[:pct_mem])
    usage.OMSAvgUserTime      = Telemetry.array_avg(oms[:usr_cpu])
    usage.OMSAvgSystemTime    = Telemetry.array_avg(oms[:sys_cpu])
    usage.OMIMaxMemory        = omi[:amt_mem].max
    usage.OMIMaxPercentMemory = omi[:pct_mem].max
    usage.OMIMaxUserTime      = omi[:usr_cpu].max
    usage.OMIMaxSystemTime    = omi[:sys_cpu].max
    usage.OMIAvgMemory        = Telemetry.array_avg(omi[:amt_mem])
    usage.OMIAvgPercentMemory = Telemetry.array_avg(omi[:pct_mem])
    usage.OMIAvgUserTime      = Telemetry.array_avg(omi[:usr_cpu])
    usage.OMIAvgSystemTime    = Telemetry.array_avg(omi[:sys_cpu])
    clear_ru_points
    usage
  rescue => e
    log_error("Error calculating resource usage. #{e}")
    nil
  end
end
+
# Empty every per-process sample array in @ru_points in place, ready for the
# next polling window.
def clear_ru_points
  @ru_points.each_value do |metrics|
    metrics.each_value(&:clear)
  end
end
+
# Merge the buffered QoS events into one AgentQoS record per source.
# Message/Operation/OperationSuccess are taken from the first buffered event
# for each source. @@qos_events is cleared on success. Returns the Array of
# AgentQoS, or nil on error (buffer retained in that case).
def calculate_qos()
  # for now, since we are only instrumented to emit upload success, only merge based on source
  qos = []

  begin
    @@qos_events.each do |source, batches|
      qos_event = AgentQoS.new
      qos_event.Source = source
      qos_event.Message = batches[0][:m]
      qos_event.OperationSuccess = batches[0][:op_success]
      qos_event.BatchCount = batches.size
      qos_event.Operation = batches[0][:op]

      counts = batches.map { |batch| batch[:c] }
      qos_event.MinBatchEventCount = counts.min
      qos_event.MaxBatchEventCount = counts.max
      qos_event.AvgBatchEventCount = Telemetry.array_avg(counts)

      qos_event.MinEventSize = batches.map { |batch| batch[:min_s] }.min
      qos_event.MaxEventSize = batches.map { |batch| batch[:max_s] }.max
      qos_event.AvgEventSize = batches.map { |batch| batch[:sum_s] }.sum / counts.sum

      # Latency keys only exist when the batches carried parsed timestamps
      # (see push_qos_event)
      if batches[0].has_key? :min_l
        qos_event.MinLocalLatencyInMs = (batches[-1][:min_l] * 1000).to_i # Latest batch will have smallest minimum latency
        qos_event.MaxLocalLatencyInMs = (batches[0][:max_l] * 1000).to_i
        qos_event.AvgLocalLatencyInMs = (((batches.map { |batch| batch[:sum_l] }).sum / counts.sum.to_f) * 1000).to_i
      else
        qos_event.MinLocalLatencyInMs = 0
        qos_event.MaxLocalLatencyInMs = 0
        qos_event.AvgLocalLatencyInMs = 0
      end

      qos_event.NetworkLatencyInMs = (((batches.map { |batch| batch[:t] }).sum / batches.size.to_f) * 1000).to_i # average

      qos << qos_event
    end
  rescue => e
    log_error("Error calculating QoS. #{e}")
    return nil
  end

  @@qos_events.clear
  return qos
end
+
# Assemble the AgentTelemetry payload: OS identity parsed from @os_info,
# region from configuration, DSC marker, aggregated resource usage and QoS.
# Returns nil when either aggregation step failed or on any error.
def create_body()
  begin
    agent_telemetry = AgentTelemetry.new
    agent_telemetry.OSType = "Linux"
    File.open(@os_info).each_line do |line|
      agent_telemetry.OSDistro = line.sub("OSName=","").strip if line =~ /OSName/
      agent_telemetry.OSVersion = line.sub("OSVersion=","").strip if line =~ /OSVersion/
    end
    agent_telemetry.Region = OMS::Configuration.azure_region
    # NOTE(review): "enabled" is reported as the existence of a file named
    # omshelper_disable — this reads as inverted; confirm intended semantics
    # before changing.
    agent_telemetry.ConfigMgrEnabled = File.exist?("/etc/opt/omi/conf/omsconfig/omshelper_disable").to_s
    agent_telemetry.AgentResourceUsage = calculate_resource_usage
    agent_telemetry.AgentQoS = calculate_qos
    return (!agent_telemetry.AgentResourceUsage.nil? and !agent_telemetry.AgentQoS.nil?) ? agent_telemetry : nil
  rescue => e
    log_error("Error creating telemetry request body. #{e}")
    return nil
  end
end
+
# Send one telemetry report to the OMS agent management service.
# Returns 0 on success, an OMS::* error code on failure, or nil when the
# request body could not be built (note: nil, not an error code).
def heartbeat()
  # Check necessary inputs
  if @workspace_id.nil? or @agent_guid.nil? or @url_tld.nil? or
      @workspace_id.empty? or @agent_guid.empty? or @url_tld.empty?
    log_error("Missing required field from configuration file: #{@omsadmin_conf_path}")
    return OMS::MISSING_CONFIG
  elsif !OMS::Common.file_exists_nonempty(@cert_path) or !OMS::Common.file_exists_nonempty(@key_path)
    log_error("Certificates for topology request do not exist")
    return OMS::MISSING_CERTS
  end

  # Generate the request body
  body = create_body
  return if body.nil?
  body = body.serialize

  # Form headers
  headers = {}
  req_date = Time.now.utc.strftime("%Y-%m-%dT%T.%N%:z")
  headers[OMS::CaseSensitiveString.new("x-ms-Date")] = req_date
  headers["User-Agent"] = "LinuxMonitoringAgent/".concat(OMS::Common.get_agent_version)
  headers[OMS::CaseSensitiveString.new("Accept-Language")] = "en-US"

  # Form POST request and HTTP, authenticated with the agent client cert
  uri = "https://#{@workspace_id}.oms.#{@url_tld}/AgentService.svc/AgentTelemetry"
  req,http = OMS::Common.form_post_request_and_http(headers, uri, body,
      OpenSSL::X509::Certificate.new(File.open(@cert_path)),
      OpenSSL::PKey::RSA.new(File.open(@key_path)), @proxy_path)

  log_info("Generated telemetry request:\n#{req.body}") if @verbose

  # Submit request; transport errors leave res nil and are handled below
  begin
    res = nil
    res = http.start { |http_each| http.request(req) }
  rescue => e
    log_error("Error sending the telemetry request to OMS agent management service: #{e.message}")
  end

  if !res.nil?
    log_info("OMS agent management service telemetry request response code: #{res.code}") if @verbose

    if res.code == "200"
      log_info("OMS agent management service telemetry request success")
      return 0
    else
      log_error("Error sending OMS agent management service telemetry request. HTTP code #{res.code}")
      return OMS::HTTP_NON_200
    end
  else
    log_error("Error sending OMS agent management service telemetry request. No HTTP code")
    return OMS::ERROR_SENDING_HTTP
  end
end # heartbeat
+
+ end # class Telemetry
+end # module OMS
+
# Print command-line usage for this telemetry script.
# NOTE(review): the placeholder list of positional arguments appears to have
# been stripped from this copy (necessary_inputs is a bare space) — verify
# against the shipped script.
def usage
  script = File.basename($0)
  args = " "
  print("\nTelemetry tool for OMS Agent\n"\
        "ruby #{script} #{args}\n"\
        "\nOptional: Add -v for verbose output\n")
end
+
# CLI entry point: poll resource usage once and send a telemetry heartbeat.
# Positional ARGV: [0]=omsadmin.conf [1]=cert [2]=key [3]=pid file
# [4]=proxy conf [5]=os info file [6]=install info file.
if __FILE__ == $0
  options = {}
  OptionParser.new do |opts|
    opts.on("-v", "--verbose") do |v|
      options[:verbose] = true
    end
  end.parse!

  # All seven positional paths are required
  if (ARGV.length < 7)
    usage
    exit 0
  end

  omsadmin_conf_path = ARGV[0]
  cert_path = ARGV[1]
  key_path = ARGV[2]
  pid_path = ARGV[3]
  proxy_path = ARGV[4]
  os_info = ARGV[5]
  install_info = ARGV[6]

  require 'fluent/log'
  require 'fluent/engine'

  # Fluent plugin code pushed into this process expects a global $log
  $log = Fluent::Log.new(STDERR, Fluent::Log::LEVEL_TRACE)

  telemetry = OMS::Telemetry.new(omsadmin_conf_path, cert_path, key_path,
              pid_path, proxy_path, os_info, install_info, log = nil, options[:verbose])

  telemetry.poll_resource_usage
  telemetry.heartbeat

  exit 0
end
\ No newline at end of file
diff --git a/source/plugins/ruby-fluentd4/agent_topology_request_script.rb b/source/plugins/ruby-fluentd4/agent_topology_request_script.rb
new file mode 100644
index 000000000..1f1d543d7
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/agent_topology_request_script.rb
@@ -0,0 +1,179 @@
+require 'optparse'
+require 'gyoku'
+require 'rexml/document'
+
+require_relative 'agent_common'
+
# Per-process CPU/memory stats sampled via the OMI CLI; rendered as XML
# attributes of the Telemetry element (see obj_to_hash's :attributes! case).
class AgentTopologyRequestOperatingSystemTelemetry < StrongTypedClass
  strongtyped_accessor :PercentUserTime, Integer
  strongtyped_accessor :PercentPrivilegedTime, Integer
  strongtyped_accessor :UsedMemory, Integer
  strongtyped_accessor :PercentUsedMemory, Integer
end
+
# OperatingSystem section of the topology request: OS identity, container/
# AKS markers, and nested process telemetry.
class AgentTopologyRequestOperatingSystem < StrongTypedClass
  strongtyped_accessor :Name, String
  strongtyped_accessor :Manufacturer, String
  # strongtyped_arch is a restricted accessor defined by StrongTypedClass
  # (agent_common) for architecture values
  strongtyped_arch :ProcessorArchitecture
  strongtyped_accessor :Version, String
  strongtyped_accessor :InContainer, String
  strongtyped_accessor :InContainerVersion, String
  strongtyped_accessor :IsAKSEnvironment, String
  strongtyped_accessor :K8SVersion, String
  strongtyped_accessor :Telemetry, AgentTopologyRequestOperatingSystemTelemetry
end
+
# Topology (heartbeat) request payload. Instance variable names become XML
# element names via obj_to_hash/Gyoku, so they must match the wire protocol
# exactly.
class AgentTopologyRequest < StrongTypedClass

  # NOTE: "FullyQualfiedDomainName" (missing 'i') matches the element name
  # the service expects — renaming it would change the emitted XML.
  strongtyped_accessor :FullyQualfiedDomainName, String
  strongtyped_accessor :EntityTypeId, String
  strongtyped_accessor :AuthenticationCertificate, String
  strongtyped_accessor :OperatingSystem, AgentTopologyRequestOperatingSystem

  # Populate self.OperatingSystem with OS identity (parsed from os_info),
  # container/AKS markers, and process stats sampled via the OMI CLI.
  # Raises ArgumentError when os_info is unreadable.
  def get_telemetry_data(os_info, conf_omsadmin, pid_file)
    os = AgentTopologyRequestOperatingSystem.new
    telemetry = AgentTopologyRequestOperatingSystemTelemetry.new

    if !File.exist?(os_info) && !File.readable?(os_info)
      raise ArgumentError, " Unable to read file #{os_info}; telemetry information will not be sent to server"
    end

    # Container detection via marker files written by docker-cimprov
    if File.exist?('/var/opt/microsoft/docker-cimprov/state/containerhostname')
      os.InContainer = "True"
      containerimagetagfile = '/var/opt/microsoft/docker-cimprov/state/omscontainertag'
      if File.exist?(containerimagetagfile) && File.readable?(containerimagetagfile)
        os.InContainerVersion = File.read(containerimagetagfile)
      end
      if !ENV['AKS_RESOURCE_ID'].nil?
        os.IsAKSEnvironment = "True"
      end
      k8sversionfile = "/var/opt/microsoft/docker-cimprov/state/kubeletversion"
      if File.exist?(k8sversionfile) && File.readable?(k8sversionfile)
        os.K8SVersion = File.read(k8sversionfile)
      end
    else
      os.InContainer = "False"
    end

    # Get process stats from omsagent for telemetry (skipped in test envs)
    if ENV['TEST_WORKSPACE_ID'].nil? && ENV['TEST_SHARED_KEY'].nil? && File.exist?(conf_omsadmin)
      process_stats = ""
      # If there is no PID file, the omsagent process has not started, so no telemetry
      if File.exist?(pid_file) and File.readable?(pid_file)
        pid = File.read(pid_file)
        process_stats = `/opt/omi/bin/omicli wql root/scx \"SELECT PercentUserTime, PercentPrivilegedTime, UsedMemory, PercentUsedMemory FROM SCX_UnixProcessStatisticalInformation where Handle like '#{pid}'\" | grep =`
      end

      process_stats.each_line do |line|
        telemetry.PercentUserTime = line.sub("PercentUserTime=","").strip.to_i if line =~ /PercentUserTime/
        telemetry.PercentPrivilegedTime = line.sub("PercentPrivilegedTime=", "").strip.to_i if line =~ /PercentPrivilegedTime/
        telemetry.UsedMemory = line.sub("UsedMemory=", "").strip.to_i if line =~ / UsedMemory/
        telemetry.PercentUsedMemory = line.sub("PercentUsedMemory=", "").strip.to_i if line =~ /PercentUsedMemory/
      end
    end

    # Get OS info from scx-release
    File.open(os_info).each_line do |line|
      os.Name = line.sub("OSName=","").strip if line =~ /OSName/
      os.Manufacturer = line.sub("OSManufacturer=","").strip if line =~ /OSManufacturer/
      os.Version = line.sub("OSVersion=","").strip if line =~ /OSVersion/
    end

    self.OperatingSystem = os
    os.Telemetry = telemetry
    os.ProcessorArchitecture = "x64"

    # If OperatingSystem is sent in the topology request with nil OS Name, Manufacturer or Version, we get HTTP 403 error
    if !os.Name || !os.Manufacturer || !os.Version
      self.OperatingSystem = nil
    end
  end
end
+
# Convert a StrongTypedClass-style object into a Hash keyed by attribute
# name (instance variable name without the leading "@"). nil attributes are
# skipped; nested StrongTypedClass values recurse; Telemetry objects are
# emitted in Gyoku :attributes! form so their fields become XML attributes
# rather than child elements.
def obj_to_hash(obj)
  obj.instance_variables.each_with_object({}) do |var, hash|
    val = obj.instance_variable_get(var)
    next if val.nil?
    if val.is_a?(AgentTopologyRequestOperatingSystemTelemetry)
      hash.update("Telemetry" => "", :attributes! => { "Telemetry" => obj_to_hash(val) })
    elsif val.is_a?(StrongTypedClass)
      hash[var.to_s.delete("@")] = obj_to_hash(val)
    else
      hash[var.to_s.delete("@")] = val
    end
  end
end
+
# Best-effort fully qualified domain name of this host, via the `hostname`
# command; returns "host.domain" when a DNS domain is configured, otherwise
# just the host name.
def evaluate_fqdn()
  # Backtick output ends with a newline; chomp both pieces — the original
  # concatenation produced "host\n.domain\n" with embedded newlines.
  hostname = `hostname`.chomp
  domainname = `hostname -d 2> /dev/null`.chomp

  if !domainname.nil? and !domainname.empty?
    return "#{hostname}.#{domainname}"
  end
  return hostname
end
+
# Builds the serialized AgentTopologyRequest XML body for heartbeats.
class AgentTopologyRequestHandler < StrongTypedClass
  # Return the request XML as a String. When telemetry is truthy, OS and
  # process telemetry is gathered via get_telemetry_data and embedded.
  def handle_request(os_info, conf_omsadmin, entity_type_id, auth_cert, pid_file, telemetry)
    topology_request = AgentTopologyRequest.new
    topology_request.FullyQualfiedDomainName = evaluate_fqdn()
    topology_request.EntityTypeId = entity_type_id
    topology_request.AuthenticationCertificate = auth_cert

    if telemetry
      topology_request.get_telemetry_data(os_info, conf_omsadmin, pid_file)
    end

    # NOTE(review): this literal appears to have lost an XML declaration
    # (e.g. "<?xml ...?>") in this copy of the file — verify against the
    # shipped script before relying on it.
    body_heartbeat = "\n"
    body_heartbeat.concat(Gyoku.xml({ "AgentTopologyRequest" => {:content! => obj_to_hash(topology_request), \
:'@xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance", :'@xmlns:xsd' => "http://www.w3.org/2001/XMLSchema", \
:@xmlns => "http://schemas.microsoft.com/WorkloadMonitoring/HealthServiceProtocol/2014/09/"}}))

    return body_heartbeat
  end
end
+
# Returns true if the provided XML string contains an OperatingSystem
# element whose Telemetry child carries at least one attribute (i.e. process
# telemetry was actually embedded); false for nil/empty/other input.
def xml_contains_telemetry(xmlstring)
  return false if xmlstring.nil? || xmlstring.empty?

  doc = REXML::Document.new(xmlstring)
  root = doc.root
  return false if root.nil? || !root.elements.respond_to?(:each)

  root.elements.each do |child|
    next unless child.name == "OperatingSystem" && child.elements.respond_to?(:each)
    child.elements.each do |os_child|
      return true if os_child.name == "Telemetry" && !os_child.attributes.empty?
    end
  end

  false
end
+
# CLI entry point: build a topology request XML and append it to a file.
# Positional ARGV: [0]=output path, [1]=os info file, [2]=omsadmin.conf,
# [3]=entity type id, [4]=auth cert string, [5]=pid file.
if __FILE__ == $0
  options = {}
  OptionParser.new do |opts|
    opts.on("-t", "--[no-]telemetry") do |t|
      options[:telemetry] = t
    end
  end.parse!

  topology_request_xml = AgentTopologyRequestHandler.new.handle_request(ARGV[1], ARGV[2],
                         ARGV[3], ARGV[4], ARGV[5], options[:telemetry])

  path = ARGV[0]
  # Append so repeated invocations accumulate requests in the same file
  File.open(path, 'a') do |f|
    f << topology_request_xml
  end
end
\ No newline at end of file
diff --git a/source/plugins/ruby-fluentd4/arc_k8s_cluster_identity.rb b/source/plugins/ruby-fluentd4/arc_k8s_cluster_identity.rb
new file mode 100644
index 000000000..571d6aeb0
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/arc_k8s_cluster_identity.rb
@@ -0,0 +1,216 @@
+# frozen_string_literal: true
+require "logger"
+require "net/http"
+require "net/https"
+require "uri"
+require "yajl/json_gem"
+require "base64"
+require "time"
+require_relative "KubernetesApiClient"
+require_relative "ApplicationInsightsUtility"
+
# Retrieves and caches the Azure Arc cluster MSI identity token. The token is
# obtained by reading the AzureClusterIdentityRequest CRD (maintained by the
# Arc agents) for a secret reference, then reading the token out of that
# secret. Tokens are valid for 24 hours and are proactively renewed one hour
# before expiry by PATCHing (or POSTing) the CRD.
class ArcK8sClusterIdentity
  # this arc k8s crd version and arc k8s uses corresponding version v1beta1 vs v1 based on the k8s version for apiextensions.k8s.io
  @@cluster_config_crd_api_version = "clusterconfig.azure.com/v1beta1"
  @@cluster_identity_resource_name = "container-insights-clusteridentityrequest"
  @@cluster_identity_resource_namespace = "azure-arc"
  @@cluster_identity_token_secret_namespace = "azure-arc"
  @@crd_resource_uri_template = "%{kube_api_server_url}/apis/%{cluster_config_crd_api_version}/namespaces/%{cluster_identity_resource_namespace}/azureclusteridentityrequests/%{cluster_identity_resource_name}"
  @@secret_resource_uri_template = "%{kube_api_server_url}/api/v1/namespaces/%{cluster_identity_token_secret_namespace}/secrets/%{token_secret_name}"
  @@azure_monitor_custom_metrics_audience = "https://monitoring.azure.com/"
  @@cluster_identity_request_kind = "AzureClusterIdentityRequest"

  def initialize
    @LogPath = "/var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log"
    # Rotating logger: keep 1 old file, rotate at ~5 MB.
    @log = Logger.new(@LogPath, 1, 5000000)
    @log.info "initialize start @ #{Time.now.utc.iso8601}"
    @token_expiry_time = Time.now
    @cached_access_token = String.new
    @token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
    @cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
    @kube_api_server_url = KubernetesApiClient.getKubeAPIServerUrl
    if @kube_api_server_url.nil?
      @log.warn "got api server url nil from KubernetesApiClient.getKubeAPIServerUrl @ #{Time.now.utc.iso8601}"
    end
    @http_client = get_http_client
    @service_account_token = get_service_account_token
    @log.info "initialize complete @ #{Time.now.utc.iso8601}"
  end

  # Returns the cached cluster MSI identity token, refreshing it from the CRD
  # and secret when the cache is empty or within 1 hour of expiry. On any
  # failure the previous cached value (possibly empty) is returned.
  def get_cluster_identity_token()
    begin
      # get the cluster msi identity token either if its empty or near expiry. Token is valid 24 hrs.
      if @cached_access_token.to_s.empty? || (Time.now + 60 * 60 > @token_expiry_time) # Refresh token 1 hr from expiration
        # renew the token if its near expiry
        if !@cached_access_token.to_s.empty? && (Time.now + 60 * 60 > @token_expiry_time)
          @log.info "renewing the token since its near expiry @ #{Time.now.utc.iso8601}"
          renew_near_expiry_token
          # sleep 60 seconds to get the renewed token available
          sleep 60
        end
        @log.info "get token reference from crd @ #{Time.now.utc.iso8601}"
        tokenReference = get_token_reference_from_crd
        if !tokenReference.nil? && !tokenReference.empty?
          @token_expiry_time = Time.parse(tokenReference["expirationTime"])
          token_secret_name = tokenReference["secretName"]
          token_secret_data_name = tokenReference["dataName"]
          # get the token from secret
          @log.info "get token from secret @ #{Time.now.utc.iso8601}"
          token = get_token_from_secret(token_secret_name, token_secret_data_name)
          if !token.nil?
            @cached_access_token = token
          else
            # BUGFIX: previously interpolated the undefined @token_secret_name
            # instance variable (always nil); log the local secret name instead.
            @log.warn "got token nil from secret: #{token_secret_name}"
          end
        else
          @log.warn "got token reference either nil or empty"
        end
      end
    rescue => err
      @log.warn "get_cluster_identity_token failed: #{err}"
      ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" })
    end
    return @cached_access_token
  end

  private

  # Reads the identity token out of the referenced secret's data and
  # base64-decodes it. Returns nil on any failure or non-200 response.
  def get_token_from_secret(token_secret_name, token_secret_data_name)
    token = nil
    begin
      secret_request_uri = @@secret_resource_uri_template % {
        kube_api_server_url: @kube_api_server_url,
        cluster_identity_token_secret_namespace: @@cluster_identity_token_secret_namespace,
        token_secret_name: token_secret_name,
      }
      get_request = Net::HTTP::Get.new(secret_request_uri)
      get_request["Authorization"] = "Bearer #{@service_account_token}"
      @log.info "Making GET request to #{secret_request_uri} @ #{Time.now.utc.iso8601}"
      get_response = @http_client.request(get_request)
      @log.info "Got response of #{get_response.code} for #{secret_request_uri} @ #{Time.now.utc.iso8601}"
      if get_response.code.to_i == 200
        token_secret = JSON.parse(get_response.body)["data"]
        cluster_identity_token = token_secret[token_secret_data_name]
        token = Base64.decode64(cluster_identity_token)
      end
    rescue => err
      @log.warn "get_token_from_secret API call failed: #{err}"
      ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" })
    end
    return token
  end

  # Reads the CRD's status and returns a hash with "expirationTime",
  # "secretName" and "dataName". Returns an empty hash on failure or non-200.
  def get_token_reference_from_crd()
    tokenReference = {}
    begin
      crd_request_uri = @@crd_resource_uri_template % {
        kube_api_server_url: @kube_api_server_url,
        cluster_config_crd_api_version: @@cluster_config_crd_api_version,
        cluster_identity_resource_namespace: @@cluster_identity_resource_namespace,
        cluster_identity_resource_name: @@cluster_identity_resource_name,
      }
      get_request = Net::HTTP::Get.new(crd_request_uri)
      get_request["Authorization"] = "Bearer #{@service_account_token}"
      @log.info "Making GET request to #{crd_request_uri} @ #{Time.now.utc.iso8601}"
      get_response = @http_client.request(get_request)
      @log.info "Got response of #{get_response.code} for #{crd_request_uri} @ #{Time.now.utc.iso8601}"
      if get_response.code.to_i == 200
        status = JSON.parse(get_response.body)["status"]
        tokenReference["expirationTime"] = status["expirationTime"]
        tokenReference["secretName"] = status["tokenReference"]["secretName"]
        tokenReference["dataName"] = status["tokenReference"]["dataName"]
      end
    rescue => err
      @log.warn "get_token_reference_from_crd call failed: #{err}"
      ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" })
    end
    return tokenReference
  end

  # Triggers a token renewal by merge-PATCHing the identity request CRD;
  # if the CRD does not exist yet (404), creates it with a POST instead.
  def renew_near_expiry_token()
    begin
      crd_request_uri = @@crd_resource_uri_template % {
        kube_api_server_url: @kube_api_server_url,
        cluster_config_crd_api_version: @@cluster_config_crd_api_version,
        cluster_identity_resource_namespace: @@cluster_identity_resource_namespace,
        cluster_identity_resource_name: @@cluster_identity_resource_name,
      }
      crd_request_body = get_crd_request_body
      crd_request_body_json = crd_request_body.to_json
      update_request = Net::HTTP::Patch.new(crd_request_uri)
      update_request["Content-Type"] = "application/merge-patch+json"
      update_request["Authorization"] = "Bearer #{@service_account_token}"
      update_request.body = crd_request_body_json
      update_response = @http_client.request(update_request)
      @log.info "Got response of #{update_response.code} for PATCH #{crd_request_uri} @ #{Time.now.utc.iso8601}"
      if update_response.code.to_i == 404
        @log.info "since crd resource doesnt exist since creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}"
        create_request = Net::HTTP::Post.new(crd_request_uri)
        create_request["Content-Type"] = "application/json"
        create_request["Authorization"] = "Bearer #{@service_account_token}"
        create_request.body = crd_request_body_json
        create_response = @http_client.request(create_request)
        @log.info "Got response of #{create_response.code} for POST #{crd_request_uri} @ #{Time.now.utc.iso8601}"
      end
    rescue => err
      @log.warn "renew_near_expiry_token call failed: #{err}"
      ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" })
    end
  end

  # Reads this pod's service-account token from the mounted secret file.
  # Returns nil when the file is missing/unreadable or an error occurs.
  def get_service_account_token()
    begin
      if File.exist?(@token_file_path) && File.readable?(@token_file_path)
        token_str = File.read(@token_file_path).strip
        return token_str
      else
        @log.warn "Unable to read token string from #{@token_file_path}"
        return nil
      end
    rescue => err
      @log.warn "get_service_account_token call failed: #{err}"
      ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" })
      return nil
    end
  end

  # Builds an HTTPS client for the API server, verifying the server cert
  # against the mounted cluster CA. Returns nil on failure.
  def get_http_client()
    begin
      base_api_server_url = URI.parse(@kube_api_server_url)
      http = Net::HTTP.new(base_api_server_url.host, base_api_server_url.port)
      http.use_ssl = true
      if !File.exist?(@cert_file_path)
        raise "#{@cert_file_path} doesnt exist"
      else
        http.ca_file = @cert_file_path
      end
      http.verify_mode = OpenSSL::SSL::VERIFY_PEER
      return http
    rescue => err
      @log.warn "Unable to create http client #{err}"
      ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" })
    end
    return nil
  end

  # Builds the AzureClusterIdentityRequest resource body used for both the
  # renewal PATCH and the creation POST.
  def get_crd_request_body
    body = {}
    body["apiVersion"] = @@cluster_config_crd_api_version
    body["kind"] = @@cluster_identity_request_kind
    body["metadata"] = {}
    body["metadata"]["name"] = @@cluster_identity_resource_name
    body["metadata"]["namespace"] = @@cluster_identity_resource_namespace
    body["spec"] = {}
    body["spec"]["audience"] = @@azure_monitor_custom_metrics_audience
    return body
  end
end
diff --git a/source/plugins/ruby-fluentd4/constants.rb b/source/plugins/ruby-fluentd4/constants.rb
new file mode 100644
index 000000000..cf41900dc
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/constants.rb
@@ -0,0 +1,102 @@
+# frozen_string_literal: true
+
# Shared constants for the container insights plugins: InsightsMetrics tag
# names, MDM metric names, default thresholds, and telemetry event names.
class Constants
  INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms"
  INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId"
  INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName"
  INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor"
  INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu"
  INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel"
  INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId"
  INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName"
  # NOTE(review): same value as INSIGHTSMETRICS_TAGS_CONTAINER_NAME — looks like
  # it should be "containerId"; confirm against all consumers before changing,
  # since the emitted tag name is part of the data contract.
  INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerName"
  INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace"
  INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName"
  INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind"
  INSIGHTSMETRICS_TAGS_POD_UID = "podUid"
  # NOTE(review): constant name is misspelled ("INSIGTHTS") but is referenced
  # by this exact name elsewhere — renaming would break callers.
  INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv"
  INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName"
  INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace"
  INSIGHTSMETRICS_TAGS_POD_NAME = "podName"
  INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes"
  INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName"
  INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics"
  REASON_OOM_KILLED = "oomkilled"
  #Kubestate (common)
  INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE = "container.azm.ms/kubestate"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME = "creationTime"
  #Kubestate (deployments)
  INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE = "kube_deployment_status_replicas_ready"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME = "deployment"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_CREATIONTIME = "creationTime"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY = "deploymentStrategy"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS = "spec_replicas"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED = "status_replicas_updated"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE = "status_replicas_available"
  #Kubestate (HPA)
  INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE = "kube_hpa_status_current_replicas"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME = "hpa"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS = "spec_max_replicas"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS = "spec_min_replicas"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND = "targetKind"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME = "targetName"
  INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS = "status_desired_replicas"

  INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME = "lastScaleTime"
  # MDM Metric names
  MDM_OOM_KILLED_CONTAINER_COUNT = "oomKilledContainerCount"
  MDM_CONTAINER_RESTART_COUNT = "restartingContainerCount"
  MDM_POD_READY_PERCENTAGE = "podReadyPercentage"
  MDM_STALE_COMPLETED_JOB_COUNT = "completedJobsCount"
  MDM_DISK_USED_PERCENTAGE = "diskUsedPercentage"
  MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage"
  MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage"
  MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage"
  MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage"
  MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage"
  MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage"
  MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage"

  CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5
  OBJECT_NAME_K8S_CONTAINER = "K8SContainer"
  OBJECT_NAME_K8S_NODE = "K8SNode"
  CPU_USAGE_NANO_CORES = "cpuUsageNanoCores"
  CPU_USAGE_MILLI_CORES = "cpuUsageMillicores"
  MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes"
  MEMORY_RSS_BYTES = "memoryRssBytes"
  PV_USED_BYTES = "pvUsedBytes"
  # Default alerting thresholds (percent) used when no configmap override is set.
  DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0
  DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0
  DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0
  DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0
  CONTROLLER_KIND_JOB = "job"
  CONTAINER_TERMINATION_REASON_COMPLETED = "completed"
  CONTAINER_STATE_TERMINATED = "terminated"
  STALE_JOB_TIME_IN_MINUTES = 360
  TELEGRAF_DISK_METRICS = "container.azm.ms/disk"
  OMSAGENT_ZERO_FILL = "omsagent"
  KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system"
  VOLUME_NAME_ZERO_FILL = "-"
  # Known persistent-volume source types; frozen since constants holding
  # mutable arrays should not be modifiable at runtime.
  PV_TYPES = ["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume",
              "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs",
              "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"].freeze

  #Telemetry constants
  CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent"
  POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent"
  CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent"
  PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent"
  PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled"
  PV_INVENTORY_HEART_BEAT_EVENT = "KubePVInventoryHeartBeatEvent"
  TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10
  KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15
  ZERO_FILL_METRICS_INTERVAL_IN_MINUTES = 30
  MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour"
  MDM_EXCEPTION_TELEMETRY_METRIC = "AKSCustomMetricsMdmExceptions"
  MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL = 30

  #Pod Statuses
  POD_STATUS_TERMINATING = "Terminating"

  AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2_FILENAME = "/opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2"
end
diff --git a/source/plugins/ruby-fluentd4/in_kube_podinventory.rb b/source/plugins/ruby-fluentd4/in_kube_podinventory.rb
new file mode 100644
index 000000000..b6ed327d2
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/in_kube_podinventory.rb
@@ -0,0 +1,656 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+require 'fluent/plugin/input'
+
+module Fluent::Plugin
+ require_relative "podinventory_to_mdm"
+
+ class Kube_PodInventory_Input < Input
+ Fluent::Plugin.register_input("kube_podinventory", self)
+
+ @@MDMKubePodInventoryTag = "mdm.kubepodinventory"
+ @@hostName = (OMS::Common.get_hostname)
+ @@kubeperfTag = "oms.api.KubePerf"
+ @@kubeservicesTag = "oms.containerinsights.KubeServices"
+ @@containerinventoryTag = "oms.containerinsights.ContainerInventory"
+
    # Plugin construction: loads dependencies and zeroes the per-pass
    # telemetry counters. Sizing knobs are populated from ENV in #start.
    def initialize
      super
      require "yaml"
      require "yajl/json_gem"
      require "yajl"
      require "set"
      require "time"

      require_relative "kubernetes_container_inventory"
      require_relative "KubernetesApiClient"
      require_relative "ApplicationInsightsUtility"
      require_relative "oms_common"
      require_relative "omslog"
      require_relative "constants"

      # refer tomlparser-agent-config for updating defaults
      # this configurable via configmap
      @PODS_CHUNK_SIZE = 0
      @PODS_EMIT_STREAM_BATCH_SIZE = 0

      # Telemetry counters, reset on every enumeration pass.
      @podCount = 0
      @serviceCount = 0
      @controllerSet = Set.new []
      @winContainerCount = 0
      @controllerData = {}
      @podInventoryE2EProcessingLatencyMs = 0
      @podsAPIE2ELatencyMs = 0
    end
+
    # Seconds between enumeration passes.
    config_param :run_interval, :time, :default => 60
    # Output tag for pod inventory records.
    config_param :tag, :string, :default => "oms.containerinsights.KubePodInventory"

    # Plugin lifecycle: builds the pod-inventory -> MDM metrics convertor.
    def configure(conf)
      super
      @inventoryToMdmConvertor = Inventory2MdmConvertor.new()
    end
+
+ def start
+ if @run_interval
+ if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? && ENV["PODS_CHUNK_SIZE"].to_i > 0
+ @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"].to_i
+ else
+ # this shouldnt happen just setting default here as safe guard
+ $log.warn("in_kube_podinventory::start: setting to default value since got PODS_CHUNK_SIZE nil or empty")
+ @PODS_CHUNK_SIZE = 1000
+ end
+ $log.info("in_kube_podinventory::start : PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}")
+
+ if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? && ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i > 0
+ @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i
+ else
+ # this shouldnt happen just setting default here as safe guard
+ $log.warn("in_kube_podinventory::start: setting to default value since got PODS_EMIT_STREAM_BATCH_SIZE nil or empty")
+ @PODS_EMIT_STREAM_BATCH_SIZE = 200
+ end
+ $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}")
+
+ @finished = false
+ @condition = ConditionVariable.new
+ @mutex = Mutex.new
+ @thread = Thread.new(&method(:run_periodic))
+ @@podTelemetryTimeTracker = DateTime.now.to_time.to_i
+ end
+ end
+
    # Plugin lifecycle: signals the periodic thread to stop (under the mutex so
    # the condition-variable wait in run_periodic wakes up) and joins it.
    def shutdown
      if @run_interval
        @mutex.synchronize {
          @finished = true
          @condition.signal
        }
        @thread.join
      end
    end
+
    # One full enumeration pass: fetches all services once, then pages through
    # the pods API in @PODS_CHUNK_SIZE chunks, handing each chunk to
    # parse_and_emit_records. Flushes AppInsights telemetry roughly every
    # 5 minutes. Errors are logged and reported, never raised to the caller.
    #
    # podList - optional pre-fetched pod list; overwritten by the first API
    #           page below, so effectively unused when paging (kept for
    #           interface compatibility).
    def enumerate(podList = nil)
      begin
        podInventory = podList
        telemetryFlush = false
        # Reset per-pass telemetry counters.
        @podCount = 0
        @serviceCount = 0
        @controllerSet = Set.new []
        @winContainerCount = 0
        @controllerData = {}
        currentTime = Time.now
        batchTime = currentTime.utc.iso8601
        serviceRecords = []
        @podInventoryE2EProcessingLatencyMs = 0
        podInventoryStartTime = (Time.now.to_f * 1000).to_i
        # Get services first so that we dont need to make a call for very chunk
        $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}")
        serviceInfo = KubernetesApiClient.getKubeResourceInfo("services")
        # serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body)
        $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}")

        if !serviceInfo.nil?
          $log.info("in_kube_podinventory::enumerate:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}")
          serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body))
          $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}")
          serviceInfo = nil
          # service inventory records much smaller and fixed size compared to serviceList
          serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime)
          # updating for telemetry
          @serviceCount += serviceRecords.length
          serviceList = nil
        end

        # to track e2e processing latency
        @podsAPIE2ELatencyMs = 0
        podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i
        # Initializing continuation token to nil
        continuationToken = nil
        $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}")
        continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}")
        $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}")
        podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i
        @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime)
        if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
          $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
          parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime)
        else
          $log.warn "in_kube_podinventory::enumerate:Received empty podInventory"
        end

        #If we receive a continuation token, make calls, process and flush data until we have processed all data
        while (!continuationToken.nil? && !continuationToken.empty?)
          podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i
          continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}")
          podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i
          @podsAPIE2ELatencyMs = @podsAPIE2ELatencyMs + (podsAPIChunkEndTime - podsAPIChunkStartTime)
          if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
            $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
            parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime)
          else
            $log.warn "in_kube_podinventory::enumerate:Received empty podInventory"
          end
        end

        @podInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - podInventoryStartTime)
        # Setting these to nil so that we dont hold memory until GC kicks in
        podInventory = nil
        serviceRecords = nil

        # Adding telemetry to send pod telemetry every 5 minutes
        timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs
        timeDifferenceInMinutes = timeDifference / 60
        if (timeDifferenceInMinutes >= 5)
          telemetryFlush = true
        end

        # Flush AppInsights telemetry once all the processing is done
        if telemetryFlush == true
          telemetryProperties = {}
          telemetryProperties["Computer"] = @@hostName
          telemetryProperties["PODS_CHUNK_SIZE"] = @PODS_CHUNK_SIZE
          telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE
          ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties)
          ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {})
          ApplicationInsightsUtility.sendMetricTelemetry("ServiceCount", @serviceCount, {})
          telemetryProperties["ControllerData"] = @controllerData.to_json
          ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", @controllerSet.length, telemetryProperties)
          if @winContainerCount > 0
            telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount
            ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties)
          end
          ApplicationInsightsUtility.sendMetricTelemetry("PodInventoryE2EProcessingLatencyMs", @podInventoryE2EProcessingLatencyMs, telemetryProperties)
          ApplicationInsightsUtility.sendMetricTelemetry("PodsAPIE2ELatencyMs", @podsAPIE2ELatencyMs, telemetryProperties)
          @@podTelemetryTimeTracker = DateTime.now.to_time.to_i
        end
      rescue => errorStr
        $log.warn "in_kube_podinventory::enumerate:Failed in enumerate: #{errorStr}"
        $log.debug_backtrace(errorStr.backtrace)
        ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
      end
    end
+
+ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601)
+ currentTime = Time.now
+ emitTime = currentTime.to_f
+ #batchTime = currentTime.utc.iso8601
+ eventStream = Fluent::MultiEventStream.new
+ containerInventoryStream = Fluent::MultiEventStream.new
+ kubePerfEventStream = Fluent::MultiEventStream.new
+ insightsMetricsEventStream = Fluent::MultiEventStream.new
+ @@istestvar = ENV["ISTEST"]
+
+ begin #begin block start
+ # Getting windows nodes from kubeapi
+ winNodes = KubernetesApiClient.getWindowsNodesArray
+ podInventory["items"].each do |item| #podInventory block start
+ # pod inventory records
+ podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime)
+ podInventoryRecords.each do |record|
+ if !record.nil?
+ eventStream.add(Fluent::Engine.now, record)
+ wrapper = {
+ "DataType" => "KUBE_POD_INVENTORY_BLOB",
+ "IPName" => "ContainerInsights",
+ "DataItems" => [record.each { |k, v| record[k] = v }],
+ }
+ @inventoryToMdmConvertor.process_pod_inventory_record(wrapper)
+ end
+ end
+ # Setting this flag to true so that we can send ContainerInventory records for containers
+ # on windows nodes and parse environment variables for these containers
+ if winNodes.length > 0
+ nodeName = ""
+ if !item["spec"]["nodeName"].nil?
+ nodeName = item["spec"]["nodeName"]
+ end
+ if (!nodeName.empty? && (winNodes.include? nodeName))
+ clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"]
+ #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel
+ containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true)
+ # Send container inventory records for containers on windows nodes
+ @winContainerCount += containerInventoryRecords.length
+ containerInventoryRecords.each do |cirecord|
+ if !cirecord.nil?
+ ciwrapper = {
+ "DataType" => "CONTAINER_INVENTORY_BLOB",
+ "IPName" => "ContainerInsights",
+ "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }],
+ }
+ containerInventoryStream.add(Fluent::Engine.now, ciwrapper)
+ end
+ end
+ end
+ end
+
+ if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE
+ $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}")
+ if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
+ $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+ end
+ router.emit_stream(@tag, eventStream) if eventStream
+ eventStream = Fluent::MultiEventStream.new
+ end
+
+ if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && containerInventoryStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE
+ $log.info("in_kube_podinventory::parse_and_emit_records: number of windows container inventory records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}")
+ router.emit_stream(@@containerinventoryTag, eventStream) if eventStream
+ containerInventoryStream = Fluent::MultiEventStream.new
+ end
+
+ #container perf records
+ containerMetricDataItems = []
+ containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime))
+ containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime))
+ containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime))
+ containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime))
+
+ containerMetricDataItems.each do |record|
+ record["DataType"] = "LINUX_PERF_BLOB"
+ record["IPName"] = "LogManagement"
+ kubePerfEventStream.add(Fluent::Engine.now, record) if record
+ end
+
+ if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE
+ $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}")
+ router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream
+ kubePerfEventStream = Fluent::MultiEventStream.new
+ end
+
+ # container GPU records
+ containerGPUInsightsMetricsDataItems = []
+ containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime))
+ containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime))
+ containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime))
+ containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime))
+ containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord|
+ wrapper = {
+ "DataType" => "INSIGHTS_METRICS_BLOB",
+ "IPName" => "ContainerInsights",
+ "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }],
+ }
+ insightsMetricsEventStream.add(Fluent::Engine.now, wrapper)
+ end
+
+ if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE
+ $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}")
+ if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
+ $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+ end
+ router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream
+ insightsMetricsEventStream = Fluent::MultiEventStream.new
+ end
+ end #podInventory block end
+
+ if eventStream.count > 0
+ $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}")
+ router.emit_stream(@tag, eventStream) if eventStream
+ if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
+ $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+ end
+ eventStream = nil
+ end
+
+ if containerInventoryStream.count > 0
+ $log.info("in_kube_podinventory::parse_and_emit_records: number of container inventory records emitted #{containerInventoryStream.count} @ #{Time.now.utc.iso8601}")
+ router.emit_stream(@@containerinventoryTag, containerInventoryStream) if containerInventoryStream
+ containerInventoryStream = nil
+ end
+
+ if kubePerfEventStream.count > 0
+ $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}")
+ router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream
+ kubePerfEventStream = nil
+ end
+
+ if insightsMetricsEventStream.count > 0
+ $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}")
+ router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream
+ if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
+ $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+ end
+ insightsMetricsEventStream = nil
+ end
+
+ if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send
+ @log.info "Sending pod inventory mdm records to out_mdm"
+ pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime)
+ @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}"
+ mdm_pod_inventory_es = Fluent::MultiEventStream.new
+ pod_inventory_mdm_records.each { |pod_inventory_mdm_record|
+ mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record
+ } if pod_inventory_mdm_records
+ router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es
+ end
+
+ if continuationToken.nil? # sending kube services inventory records
+ kubeServicesEventStream = Fluent::MultiEventStream.new
+ serviceRecords.each do |kubeServiceRecord|
+ if !kubeServiceRecord.nil?
+ # adding before emit to reduce memory foot print
+ kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId
+ kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName
+ kubeServicesEventStream.add(Fluent::Engine.now, kubeServiceRecord)
+ if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE
+ $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}")
+ router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream
+ kubeServicesEventStream = Fluent::MultiEventStream.new
+ end
+ end
+ end
+
+ if kubeServicesEventStream.count > 0
+ $log.info("in_kube_podinventory::parse_and_emit_records : number of service records emitted #{kubeServicesEventStream.count} @ #{Time.now.utc.iso8601}")
+ router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream
+ end
+ kubeServicesEventStream = nil
+ end
+
+ #Updating value for AppInsights telemetry
+ @podCount += podInventory["items"].length
+ rescue => errorStr
+ $log.warn "Failed in parse_and_emit_record pod inventory: #{errorStr}"
+ $log.debug_backtrace(errorStr.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end #begin block end
+ end
+
+ def run_periodic
+ @mutex.lock
+ done = @finished
+ @nextTimeToRun = Time.now
+ @waitTimeout = @run_interval
+ until done
+ @nextTimeToRun = @nextTimeToRun + @run_interval
+ @now = Time.now
+ if @nextTimeToRun <= @now
+ @waitTimeout = 1
+ @nextTimeToRun = @now
+ else
+ @waitTimeout = @nextTimeToRun - @now
+ end
+ @condition.wait(@mutex, @waitTimeout)
+ done = @finished
+ @mutex.unlock
+ if !done
+ begin
+ $log.info("in_kube_podinventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}")
+ enumerate
+ $log.info("in_kube_podinventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}")
+ rescue => errorStr
+ $log.warn "in_kube_podinventory::run_periodic: enumerate Failed to retrieve pod inventory: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ end
+ @mutex.lock
+ end
+ @mutex.unlock
+ end
+
+ # TODO - move this method to KubernetesClient or helper class
+ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601)
+ records = []
+ record = {}
+
+ begin
+ record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated
+ record["Name"] = item["metadata"]["name"]
+ podNameSpace = item["metadata"]["namespace"]
+ podUid = KubernetesApiClient.getPodUid(podNameSpace, item["metadata"])
+ if podUid.nil?
+ return records
+ end
+
+ nodeName = ""
+ #for unscheduled (non-started) pods nodeName does NOT exist
+ if !item["spec"]["nodeName"].nil?
+ nodeName = item["spec"]["nodeName"]
+ end
+ # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes
+ if KubernetesApiClient.isAROv3MasterOrInfraPod(nodeName)
+ return records
+ end
+
+ record["PodUid"] = podUid
+ record["PodLabel"] = [item["metadata"]["labels"]]
+ record["Namespace"] = podNameSpace
+ record["PodCreationTimeStamp"] = item["metadata"]["creationTimestamp"]
+ #for unscheduled (non-started) pods startTime does NOT exist
+ if !item["status"]["startTime"].nil?
+ record["PodStartTime"] = item["status"]["startTime"]
+ else
+ record["PodStartTime"] = ""
+ end
+ #podStatus
+ # the below is for accounting 'NodeLost' scenario, where-in the pod(s) in the lost node is still being reported as running
+ podReadyCondition = true
+ if !item["status"]["reason"].nil? && item["status"]["reason"] == "NodeLost" && !item["status"]["conditions"].nil?
+ item["status"]["conditions"].each do |condition|
+ if condition["type"] == "Ready" && condition["status"] == "False"
+ podReadyCondition = false
+ break
+ end
+ end
+ end
+ if podReadyCondition == false
+ record["PodStatus"] = "Unknown"
+ # ICM - https://portal.microsofticm.com/imp/v3/incidents/details/187091803/home
+ elsif !item["metadata"]["deletionTimestamp"].nil? && !item["metadata"]["deletionTimestamp"].empty?
+ record["PodStatus"] = Constants::POD_STATUS_TERMINATING
+ else
+ record["PodStatus"] = item["status"]["phase"]
+ end
+ #for unscheduled (non-started) pods podIP does NOT exist
+ if !item["status"]["podIP"].nil?
+ record["PodIp"] = item["status"]["podIP"]
+ else
+ record["PodIp"] = ""
+ end
+
+ record["Computer"] = nodeName
+ record["ClusterId"] = KubernetesApiClient.getClusterId
+ record["ClusterName"] = KubernetesApiClient.getClusterName
+ record["ServiceName"] = getServiceNameFromLabels(item["metadata"]["namespace"], item["metadata"]["labels"], serviceRecords)
+
+ if !item["metadata"]["ownerReferences"].nil?
+ record["ControllerKind"] = item["metadata"]["ownerReferences"][0]["kind"]
+ record["ControllerName"] = item["metadata"]["ownerReferences"][0]["name"]
+ @controllerSet.add(record["ControllerKind"] + record["ControllerName"])
+ #Adding controller kind to telemetry to get information about customer workload
+ if (@controllerData[record["ControllerKind"]].nil?)
+ @controllerData[record["ControllerKind"]] = 1
+ else
+ controllerValue = @controllerData[record["ControllerKind"]]
+ @controllerData[record["ControllerKind"]] += 1
+ end
+ end
+ podRestartCount = 0
+ record["PodRestartCount"] = 0
+
+ #Invoke the helper method to compute ready/not ready mdm metric
+ @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"])
+
+ podContainers = []
+ if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty?
+ podContainers = podContainers + item["status"]["containerStatuses"]
+ end
+ # Adding init containers to the record list as well.
+ if item["status"].key?("initContainerStatuses") && !item["status"]["initContainerStatuses"].empty?
+ podContainers = podContainers + item["status"]["initContainerStatuses"]
+ end
+ # if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? #container status block start
+ if !podContainers.empty? #container status block start
+ podContainers.each do |container|
+ containerRestartCount = 0
+ lastFinishedTime = nil
+ # Need this flag to determine if we need to process container data for mdm metrics like oomkilled and container restart
+ #container Id is of the form
+ #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527
+ if !container["containerID"].nil?
+ record["ContainerID"] = container["containerID"].split("//")[1]
+ else
+ # for containers that have image issues (like invalid image/tag etc..) this will be empty. do not make it all 0
+ record["ContainerID"] = ""
+ end
+ #keeping this as <PodUid>/<ContainerName>, which is the same as InstanceName in the perf table
+ if podUid.nil? || container["name"].nil?
+ next
+ else
+ record["ContainerName"] = podUid + "/" + container["name"]
+ end
+ #Pod restart count is a sumtotal of restart counts of individual containers
+ #within the pod. The restart count of a container is maintained by kubernetes
+ #itself in the form of a container label.
+ containerRestartCount = container["restartCount"]
+ record["ContainerRestartCount"] = containerRestartCount
+
+ containerStatus = container["state"]
+ record["ContainerStatusReason"] = ""
+ # state is of the following form , so just picking up the first key name
+ # "state": {
+ # "waiting": {
+ # "reason": "CrashLoopBackOff",
+ # "message": "Back-off 5m0s restarting failed container=metrics-server pod=metrics-server-2011498749-3g453_kube-system(5953be5f-fcae-11e7-a356-000d3ae0e432)"
+ # }
+ # },
+ # the below is for accounting 'NodeLost' scenario, where-in the containers in the lost node/pod(s) is still being reported as running
+ if podReadyCondition == false
+ record["ContainerStatus"] = "Unknown"
+ else
+ record["ContainerStatus"] = containerStatus.keys[0]
+ end
+ #TODO : Remove ContainerCreationTimeStamp from here since we are sending it as a metric
+ #Picking up both container and node start time from cAdvisor to be consistent
+ if containerStatus.keys[0] == "running"
+ record["ContainerCreationTimeStamp"] = container["state"]["running"]["startedAt"]
+ else
+ if !containerStatus[containerStatus.keys[0]]["reason"].nil? && !containerStatus[containerStatus.keys[0]]["reason"].empty?
+ record["ContainerStatusReason"] = containerStatus[containerStatus.keys[0]]["reason"]
+ end
+ # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm
+ if !record["ControllerKind"].nil? && record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB
+ @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus)
+ end
+ end
+
+ # Record the last state of the container. This may have information on why a container was killed.
+ begin
+ if !container["lastState"].nil? && container["lastState"].keys.length == 1
+ lastStateName = container["lastState"].keys[0]
+ lastStateObject = container["lastState"][lastStateName]
+ if !lastStateObject.is_a?(Hash)
+ raise "expected a hash object. This could signify a bug or a kubernetes API change"
+ end
+
+ if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt")
+ newRecord = Hash.new
+ newRecord["lastState"] = lastStateName # get the name of the last state (ex: terminated)
+ lastStateReason = lastStateObject["reason"]
+ # newRecord["reason"] = lastStateObject["reason"] # (ex: OOMKilled)
+ newRecord["reason"] = lastStateReason # (ex: OOMKilled)
+ newRecord["startedAt"] = lastStateObject["startedAt"] # (ex: 2019-07-02T14:58:51Z)
+ lastFinishedTime = lastStateObject["finishedAt"]
+ newRecord["finishedAt"] = lastFinishedTime # (ex: 2019-07-02T14:58:52Z)
+
+ # only write to the output field if everything previously ran without error
+ record["ContainerLastStatus"] = newRecord
+
+ #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled
+ if lastStateReason.downcase == Constants::REASON_OOM_KILLED
+ @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime)
+ end
+ lastStateReason = nil
+ else
+ record["ContainerLastStatus"] = Hash.new
+ end
+ else
+ record["ContainerLastStatus"] = Hash.new
+ end
+
+ #Populate mdm metric for container restart count if greater than 0
+ if (!containerRestartCount.nil? && (containerRestartCount.is_a? Integer) && containerRestartCount > 0)
+ @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime)
+ end
+ rescue => errorStr
+ $log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}"
+ $log.debug_backtrace(errorStr.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ record["ContainerLastStatus"] = Hash.new
+ end
+
+ podRestartCount += containerRestartCount
+ records.push(record.dup)
+ end
+ else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod
+ records.push(record)
+ end #container status block end
+
+ records.each do |record|
+ if !record.nil?
+ record["PodRestartCount"] = podRestartCount
+ end
+ end
+ rescue => error
+ $log.warn("getPodInventoryRecords failed: #{error}")
+ end
+ return records
+ end
+
+ # TODO - move this method to KubernetesClient or helper class
+ def getServiceNameFromLabels(namespace, labels, serviceRecords)
+ serviceName = ""
+ begin
+ if !labels.nil? && !labels.empty?
+ serviceRecords.each do |kubeServiceRecord|
+ found = 0
+ if kubeServiceRecord["Namespace"] == namespace
+ selectorLabels = {}
+ # selector labels wrapped in array in kube service records so unwrapping here
+ if !kubeServiceRecord["SelectorLabels"].nil? && kubeServiceRecord["SelectorLabels"].length > 0
+ selectorLabels = kubeServiceRecord["SelectorLabels"][0]
+ end
+ if !selectorLabels.nil? && !selectorLabels.empty?
+ selectorLabels.each do |key, value|
+ if !(labels.select { |k, v| k == key && v == value }.length > 0)
+ break
+ end
+ found = found + 1
+ end
+ # service can have no selectors
+ if found == selectorLabels.length
+ return kubeServiceRecord["ServiceName"]
+ end
+ end
+ end
+ end
+ end
+ rescue => errorStr
+ $log.warn "Failed to retrieve service name from labels: #{errorStr}"
+ $log.debug_backtrace(errorStr.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ return serviceName
+ end
+ end # Kube_Pod_Input
+end # module
diff --git a/source/plugins/ruby-fluentd4/kubernetes_container_inventory.rb b/source/plugins/ruby-fluentd4/kubernetes_container_inventory.rb
new file mode 100644
index 000000000..69beca493
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/kubernetes_container_inventory.rb
@@ -0,0 +1,360 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+class KubernetesContainerInventory
+ require "yajl/json_gem"
+ require "time"
+ require "json"
+ require_relative "omslog"
+ require_relative "ApplicationInsightsUtility"
+
+ # cache the container and cgroup parent process
+ @@containerCGroupCache = Hash.new
+
+ def initialize
+ end
+
+ class << self
+ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVar, isWindows = false)
+ containerInventoryRecords = Array.new
+ begin
+ containersInfoMap = getContainersInfoMap(podItem, isWindows)
+ podContainersStatuses = []
+ if !podItem["status"]["containerStatuses"].nil? && !podItem["status"]["containerStatuses"].empty?
+ podContainersStatuses = podItem["status"]["containerStatuses"]
+ end
+ if !podItem["status"]["initContainerStatuses"].nil? && !podItem["status"]["initContainerStatuses"].empty?
+ podContainersStatuses = podContainersStatuses + podItem["status"]["initContainerStatuses"]
+ end
+
+ if !podContainersStatuses.empty?
+ podContainersStatuses.each do |containerStatus|
+ containerInventoryRecord = {}
+ containerInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated
+ containerName = containerStatus["name"]
+ # containerID format is <containerRuntime>://<containerId>
+ containerRuntime = ""
+ containerId = ""
+ if !containerStatus["containerID"].nil?
+ containerRuntime = containerStatus["containerID"].split(":")[0]
+ containerId = containerStatus["containerID"].split("//")[1]
+ containerInventoryRecord["InstanceID"] = containerId
+ else
+ # for containers that have image issues (like invalid image/tag etc..) this will be empty. do not make it all 0
+ containerInventoryRecord["InstanceID"] = containerId
+ end
+ # imageID is of the format - repo@sha256:imageid
+ imageIdValue = containerStatus["imageID"]
+ if !imageIdValue.nil? && !imageIdValue.empty?
+ atLocation = imageIdValue.index("@")
+ if !atLocation.nil?
+ containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1]
+ end
+ end
+ # image is of the format - repository/image:imagetag
+ imageValue = containerStatus["image"]
+ if !imageValue.nil? && !imageValue.empty?
+ # Find delimiters in the string of format repository/image:imagetag
+ slashLocation = imageValue.index("/")
+ colonLocation = imageValue.index(":")
+ if !colonLocation.nil?
+ if slashLocation.nil?
+ # image:imagetag
+ containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)]
+ else
+ # repository/image:imagetag
+ containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)]
+ containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)]
+ end
+ containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1]
+ end
+ elsif !imageIdValue.nil? && !imageIdValue.empty?
+ # Getting repo information from imageIdValue when no tag in ImageId
+ if !atLocation.nil?
+ containerInventoryRecord["Repository"] = imageIdValue[0..(atLocation - 1)]
+ end
+ end
+ containerInventoryRecord["ExitCode"] = 0
+ isContainerTerminated = false
+ isContainerWaiting = false
+ if !containerStatus["state"].nil? && !containerStatus["state"].empty?
+ containerState = containerStatus["state"]
+ if containerState.key?("running")
+ containerInventoryRecord["State"] = "Running"
+ containerInventoryRecord["StartedTime"] = containerState["running"]["startedAt"]
+ elsif containerState.key?("terminated")
+ containerInventoryRecord["State"] = "Terminated"
+ containerInventoryRecord["StartedTime"] = containerState["terminated"]["startedAt"]
+ containerInventoryRecord["FinishedTime"] = containerState["terminated"]["finishedAt"]
+ exitCodeValue = containerState["terminated"]["exitCode"]
+ if exitCodeValue < 0
+ exitCodeValue = 128
+ end
+ containerInventoryRecord["ExitCode"] = exitCodeValue
+ if exitCodeValue > 0
+ containerInventoryRecord["State"] = "Failed"
+ end
+ isContainerTerminated = true
+ elsif containerState.key?("waiting")
+ containerInventoryRecord["State"] = "Waiting"
+ isContainerWaiting = true
+ end
+ end
+
+ restartCount = 0
+ if !containerStatus["restartCount"].nil?
+ restartCount = containerStatus["restartCount"]
+ end
+
+ containerInfoMap = containersInfoMap[containerName]
+ podName = containerInfoMap["PodName"]
+ namespace = containerInfoMap["Namespace"]
+ # container name in the format that docker sees
+ containerNameInDockerFormat = "k8s_#{containerName}_#{podName}_#{namespace}_#{containerId}_#{restartCount}"
+ containerInventoryRecord["ElementName"] = containerNameInDockerFormat
+ containerInventoryRecord["Computer"] = containerInfoMap["Computer"]
+ containerInventoryRecord["ContainerHostname"] = podName
+ containerInventoryRecord["CreatedTime"] = containerInfoMap["CreatedTime"]
+ containerInventoryRecord["EnvironmentVar"] = containerInfoMap["EnvironmentVar"]
+ containerInventoryRecord["Ports"] = containerInfoMap["Ports"]
+ containerInventoryRecord["Command"] = containerInfoMap["Command"]
+ if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? && clusterCollectEnvironmentVar.casecmp("false") == 0
+ containerInventoryRecord["EnvironmentVar"] = ["AZMON_CLUSTER_COLLECT_ENV_VAR=FALSE"]
+ elsif isWindows || isContainerTerminated || isContainerWaiting
+ # for terminated and waiting containers, since the container process doesn't exist we lose the env vars and can only get them from the pod spec
+ containerInventoryRecord["EnvironmentVar"] = containerInfoMap["EnvironmentVar"]
+ else
+ if containerId.nil? || containerId.empty? || containerRuntime.nil? || containerRuntime.empty?
+ containerInventoryRecord["EnvironmentVar"] = ""
+ else
+ if containerRuntime.casecmp("cri-o") == 0
+ # crio containers have conmon as the parent process and we only need to get the container main process env vars
+ containerInventoryRecord["EnvironmentVar"] = obtainContainerEnvironmentVars("crio-#{containerId}")
+ else
+ containerInventoryRecord["EnvironmentVar"] = obtainContainerEnvironmentVars(containerId)
+ end
+ end
+ end
+ containerInventoryRecords.push containerInventoryRecord
+ end
+ end
+ rescue => error
+ $log.warn("KubernetesContainerInventory::getContainerInventoryRecords : Get Container Inventory Records failed: #{error}")
+ $log.debug_backtrace(error.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(error)
+ end
+ return containerInventoryRecords
+ end
+
+ def getContainersInfoMap(podItem, isWindows)
+ containersInfoMap = {}
+ begin
+ nodeName = (!podItem["spec"]["nodeName"].nil?) ? podItem["spec"]["nodeName"] : ""
+ createdTime = podItem["metadata"]["creationTimestamp"]
+ podName = podItem["metadata"]["name"]
+ namespace = podItem["metadata"]["namespace"]
+ if !podItem.nil? && !podItem.empty? && podItem.key?("spec") && !podItem["spec"].nil? && !podItem["spec"].empty?
+ podContainers = []
+ if !podItem["spec"]["containers"].nil? && !podItem["spec"]["containers"].empty?
+ podContainers = podItem["spec"]["containers"]
+ end
+ if !podItem["spec"]["initContainers"].nil? && !podItem["spec"]["initContainers"].empty?
+ podContainers = podContainers + podItem["spec"]["initContainers"]
+ end
+ if !podContainers.empty?
+ podContainers.each do |container|
+ containerInfoMap = {}
+ containerName = container["name"]
+ containerInfoMap["ElementName"] = containerName
+ containerInfoMap["Computer"] = nodeName
+ containerInfoMap["PodName"] = podName
+ containerInfoMap["Namespace"] = namespace
+ containerInfoMap["CreatedTime"] = createdTime
+ portsValue = container["ports"]
+ portsValueString = (portsValue.nil?) ? "" : portsValue.to_s
+ containerInfoMap["Ports"] = portsValueString
+ cmdValue = container["command"]
+ cmdValueString = (cmdValue.nil?) ? "" : cmdValue.to_s
+ containerInfoMap["Command"] = cmdValueString
+ containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container)
+ containersInfoMap[containerName] = containerInfoMap
+ end
+ end
+ end
+ rescue => error
+ $log.warn("KubernetesContainerInventory::getContainersInfoMap : Get Container Info Maps failed: #{error}")
+ $log.debug_backtrace(error.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(error)
+ end
+ return containersInfoMap
+ end
+
+ def obtainContainerEnvironmentVars(containerId)
+ envValueString = ""
+ begin
+ isCGroupPidFetchRequired = false
+ if !@@containerCGroupCache.has_key?(containerId)
+ isCGroupPidFetchRequired = true
+ else
+ cGroupPid = @@containerCGroupCache[containerId]
+ if cGroupPid.nil? || cGroupPid.empty?
+ isCGroupPidFetchRequired = true
+ @@containerCGroupCache.delete(containerId)
+ elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ")
+ isCGroupPidFetchRequired = true
+ @@containerCGroupCache.delete(containerId)
+ end
+ end
+
+ if isCGroupPidFetchRequired
+ Dir["/hostfs/proc/*/cgroup"].each do |filename|
+ begin
+ if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any?
+ # file full path is /hostfs/proc/<pid>/cgroup
+ cGroupPid = filename.split("/")[3]
+ if is_number?(cGroupPid)
+ if @@containerCGroupCache.has_key?(containerId)
+ tempCGroupPid = @@containerCGroupCache[containerId]
+ if tempCGroupPid.to_i > cGroupPid.to_i
+ @@containerCGroupCache[containerId] = cGroupPid
+ end
+ else
+ @@containerCGroupCache[containerId] = cGroupPid
+ end
+ end
+ end
+ rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read
+ end
+ end
+ end
+ cGroupPid = @@containerCGroupCache[containerId]
+ if !cGroupPid.nil? && !cGroupPid.empty?
+ environFilePath = "/hostfs/proc/#{cGroupPid}/environ"
+ if File.exist?(environFilePath)
+ # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE
+ # Check to see if the environment variable collection is disabled for this container.
+ if File.foreach(environFilePath).grep(/AZMON_COLLECT_ENV=FALSE/i).any?
+ envValueString = ["AZMON_COLLECT_ENV=FALSE"]
+ $log.warn("Environment Variable collection for container: #{containerId} skipped because AZMON_COLLECT_ENV is set to false")
+ else
+ # Restricting the ENV string value to 200kb since the size of this string can go very high
+ envVars = File.read(environFilePath, 200000)
+ if !envVars.nil? && !envVars.empty?
+ envVars = envVars.split("\0")
+ envValueString = envVars.to_json
+ envValueStringLength = envValueString.length
+ if envValueStringLength >= 200000
+ lastIndex = envValueString.rindex("\",")
+ if !lastIndex.nil?
+ envValueStringTruncated = envValueString.slice(0..lastIndex) + "]"
+ envValueString = envValueStringTruncated
+ end
+ end
+ end
+ end
+ end
+ else
+ $log.warn("KubernetesContainerInventory::obtainContainerEnvironmentVars: cGroupPid is NIL or empty for containerId: #{containerId}")
+ end
+ rescue => error
+ $log.warn("KubernetesContainerInventory::obtainContainerEnvironmentVars: obtain Container Environment vars failed: #{error} for containerId: #{containerId}")
+ $log.debug_backtrace(error.backtrace)
+ end
+ return envValueString
+ end
+
+ def obtainContainerEnvironmentVarsFromPodsResponse(pod, container)
+ envValueString = ""
+ begin
+ envVars = []
+ envVarsJSON = container["env"]
+ if !pod.nil? && !pod.empty? && !envVarsJSON.nil? && !envVarsJSON.empty?
+ envVarsJSON.each do |envVar|
+ key = envVar["name"]
+ value = ""
+ if !envVar["value"].nil?
+ value = envVar["value"]
+ elsif !envVar["valueFrom"].nil?
+ valueFrom = envVar["valueFrom"]
+ # https://kubernetes.io/docs/tasks/inject-data-application/environment-variable-expose-pod-information/#use-pod-fields-as-values-for-environment-variables
+ if valueFrom.key?("fieldRef") && !valueFrom["fieldRef"]["fieldPath"].nil? && !valueFrom["fieldRef"]["fieldPath"].empty?
+ fieldPath = valueFrom["fieldRef"]["fieldPath"]
+ fields = fieldPath.split(".")
+ if fields.length() == 2
+ if !fields[1].nil? && !fields[1].empty? && fields[1].end_with?("]")
+ indexFields = fields[1].split("[")
+ hashMapValue = pod[fields[0]][indexFields[0]]
+ if !hashMapValue.nil? && !hashMapValue.empty?
+ subField = indexFields[1].chomp("]").delete("\\'")
+ value = hashMapValue[subField]
+ end
+ else
+ value = pod[fields[0]][fields[1]]
+ end
+ end
+ # https://kubernetes.io/docs/tasks/inject-data-application/environment-variable-expose-pod-information/#use-container-fields-as-values-for-environment-variables
+ elsif valueFrom.key?("resourceFieldRef") && !valueFrom["resourceFieldRef"]["resource"].nil? && !valueFrom["resourceFieldRef"]["resource"].empty?
+ resource = valueFrom["resourceFieldRef"]["resource"]
+ resourceFields = resource.split(".")
+ containerResources = container["resources"]
+ if !containerResources.nil? && !containerResources.empty? && resourceFields.length() == 2
+ value = containerResources[resourceFields[0]][resourceFields[1]]
+ end
+ # https://kubernetes.io/docs/concepts/configuration/secret/#using-secrets-as-environment-variables
+ elsif valueFrom.key?("secretKeyRef")
+ secretName = valueFrom["secretKeyRef"]["name"]
+ secretKey = valueFrom["secretKeyRef"]["key"]
+ # This is still the secret reference, not the plaintext. Flatten the value so that the CI UX can show it
+ if !secretName.nil? && !secretName.empty? && !secretKey.nil? && !secretKey.empty?
+ value = "secretKeyRef_#{secretName}_#{secretKey}"
+ end
+ else
+ value = envVar["valueFrom"].to_s
+ end
+ end
+ envVars.push("#{key}=#{value}")
+ end
+ envValueString = envVars.to_json
+ containerName = container["name"]
+ # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE
+ # Check to see if the environment variable collection is disabled for this container.
+ if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString)
+ envValueString = ["AZMON_COLLECT_ENV=FALSE"]
+ $log.warn("Environment Variable collection for container: #{containerName} skipped because AZMON_COLLECT_ENV is set to false")
+ else
+ # Restricting the ENV string value to 200kb since the size of this string can go very high
+ if envValueString.length > 200000
+ envValueStringTruncated = envValueString.slice(0..200000)
+ lastIndex = envValueStringTruncated.rindex("\",")
+ if !lastIndex.nil?
+ envValueString = envValueStringTruncated.slice(0..lastIndex) + "]"
+ else
+ envValueString = envValueStringTruncated
+ end
+ end
+ end
+ end
+ rescue => error
+ $log.warn("KubernetesContainerInventory::obtainContainerEnvironmentVarsFromPodsResponse: parsing of EnvVars failed: #{error}")
+ $log.debug_backtrace(error.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(error)
+ end
+ return envValueString
+ end
+
+ def deleteCGroupCacheEntryForDeletedContainer(containerId)
+ begin
+ if !containerId.nil? && !containerId.empty? && !@@containerCGroupCache.nil? && @@containerCGroupCache.length > 0 && @@containerCGroupCache.key?(containerId)
+ @@containerCGroupCache.delete(containerId)
+ end
+ rescue => error
+ $log.warn("KubernetesContainerInventory::deleteCGroupCacheEntryForDeletedContainer: deleting of cache entry failed: #{error}")
+ $log.debug_backtrace(error.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(error)
+ end
+ end
+ def is_number?(value)
+ true if Integer(value) rescue false
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights.rb b/source/plugins/ruby-fluentd4/lib/application_insights.rb
new file mode 100644
index 000000000..0a683d484
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights.rb
@@ -0,0 +1,9 @@
+require_relative 'application_insights/telemetry_client'
+require_relative 'application_insights/unhandled_exception'
+require_relative 'application_insights/version'
+
+module ApplicationInsights
+ module Rack
+ autoload :TrackRequest, "application_insights/rack/track_request"
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/asynchronous_queue.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/asynchronous_queue.rb
new file mode 100644
index 000000000..333f6968b
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/asynchronous_queue.rb
@@ -0,0 +1,58 @@
+require_relative 'event'
+require_relative 'queue_base'
+
+module ApplicationInsights
+ module Channel
+ # An asynchronous queue for use in conjunction with the {AsynchronousSender}.
+ # The queue will notify the sender that it needs to pick up items when it
+ # reaches {#max_queue_length}, or when the consumer calls {#flush} via the
+ # {#flush_notification} event.
+ #
+ # @example
+ # require 'application_insights'
+ # require 'thread'
+ # queue = ApplicationInsights::Channel::AsynchronousQueue.new nil
+ # Thread.new do
+ # sleep 1
+ # queue.push 1
+ # queue.flush
+ # end
+ # queue.flush_notification.wait
+ # queue.flush_notification.clear
+ # result = queue.pop
+ class AsynchronousQueue < QueueBase
+ # Initializes a new instance of the class.
+ # @param [SenderBase] sender the sender object that will be used in
+ # conjunction with this queue. In addition to the sender object must
+ # support a {AsynchronousSender#start} method which is invoked each time
+ # an item is pushed to the queue as well as use the {#flush_notification}
+ # event.
+ def initialize(sender)
+ @flush_notification = Event.new
+ super sender
+ end
+
+ # The flush notification {ApplicationInsights::Channel::Event} that the {#sender}
+ # will use to get notified that a flush is needed.
+ # @return [Event] object that the {#sender} can wait on.
+ attr_reader :flush_notification
+
+ # Adds the passed in item object to the queue and notifies the {#sender}
+ # to start an asynchronous send operation
+ # by calling {AsynchronousSender#start}.
+ # @param [Contracts::Envelope] item the telemetry envelope object to send
+ # to the service.
+ def push(item)
+ super item
+ @sender.start if @sender
+ end
+
+ # Flushes the current queue by notifying the {#sender} via the
+ # {#flush_notification} event.
+ def flush
+ @flush_notification.set
+ @sender.start if @sender
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/asynchronous_sender.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/asynchronous_sender.rb
new file mode 100644
index 000000000..4786aa1d9
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/asynchronous_sender.rb
@@ -0,0 +1,138 @@
+require_relative 'sender_base'
+require 'thread'
+
+module ApplicationInsights
+ module Channel
+ # An asynchronous sender that works in conjunction with the {AsynchronousQueue}.
+ # The sender object will start a worker thread that will pull items from the
+ # {#queue}. The thread will be created when the client calls {#start} and
+ # will check for queue items every {#send_interval} seconds. The worker thread
+ # can also be forced to check the queue by setting the
+ # {AsynchronousQueue#flush_notification} event.
+ #
+ # - If no items are found, the thread will go back to sleep.
+ # - If items are found, the worker thread will send items to the specified
+ # service in batches of {#send_buffer_size}.
+ #
+ # If no queue items are found for {#send_time} seconds, the worker thread
+ # will shut down (and {#start} will need to be called again).
    # Worker-thread based sender; see the class comment above for the full
    # start / drain / idle-shutdown lifecycle.
    class AsynchronousSender < SenderBase
      # Default ingestion endpoint, used when the caller passes nil or ''.
      SERVICE_ENDPOINT_URI = 'https://dc.services.visualstudio.com/v2/track'

      # Initializes a new instance of the class.
      # @param [String] service_endpoint_uri the address of the service to
      #   send telemetry data to (falls back to SERVICE_ENDPOINT_URI when nil
      #   or empty).
      # @param [Hash] proxy proxy server configuration (optional).
      def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, proxy = {})
        # Callers that only need to configure a proxy should not have to
        # maintain the service endpoint URI, which can potentially change.
        if service_endpoint_uri.nil? || service_endpoint_uri.empty?
          service_endpoint_uri = SERVICE_ENDPOINT_URI
        end
        @send_interval = 1.0          # seconds between queue checks
        @send_remaining_time = 0      # countdown until idle shutdown
        @send_time = 3.0              # idle lifetime of the worker thread
        @lock_work_thread = Mutex.new # guards creation of @work_thread
        @work_thread = nil
        # true once the worker loop has observed the latest #start call
        @start_notification_processed = true
        super service_endpoint_uri, proxy
      end

      # The time span in seconds at which the worker thread will check the
      # {#queue} for items (defaults to: 1.0).
      # @return [Fixnum] the interval in seconds.
      attr_accessor :send_interval

      # The time span in seconds for which the worker thread will stay alive
      # if no items are found in the {#queue} (defaults to: 3.0).
      # @return [Fixnum] the interval in seconds.
      attr_accessor :send_time

      # The worker thread which checks queue items and sends data every
      # {#send_interval} seconds or upon flush.
      # @return [Thread] the work thread.
      attr_reader :work_thread

      # Creates a worker thread that checks the {#queue} every
      # {#send_interval} seconds, for a total duration of {#send_time}
      # seconds of inactivity. If a worker thread already exists, calling
      # this method only records the start notification.
      def start
        @start_notification_processed = false
        # Maintain at most one worker thread at a time. Double-checked
        # locking: the outer test avoids taking the mutex on the hot path,
        # the inner test makes creation race-free.
        unless @work_thread
          @lock_work_thread.synchronize do
            unless @work_thread
              # The poll interval can't be lower than 100ms.
              local_send_interval = [@send_interval, 0.1].max
              @send_remaining_time = [@send_time, local_send_interval].max
              @work_thread = Thread.new { run }
              # A worker failure must not take down the host process.
              @work_thread.abort_on_exception = false
            end
          end
        end
      end

      private

      # Worker-thread main loop: drain the queue in batches of
      # @send_buffer_size, then wait on the queue's flush notification (at
      # most one poll interval), shutting down once @send_remaining_time is
      # exhausted with no pending start notification.
      def run
        # Save the queue locally so a concurrent reassignment of @queue
        # cannot change it mid-loop.
        local_queue = @queue
        if local_queue.nil?
          @work_thread = nil
          return
        end

        begin
          # Fix up the send interval (can't be lower than 100ms).
          local_send_interval = [@send_interval, 0.1].max

          while true
            @start_notification_processed = true
            while true
              # Get at most @send_buffer_size items from the queue.
              data = []
              @send_buffer_size.downto(1) do
                item = local_queue.pop
                break if not item
                data.push item
              end

              # If we didn't get any items from the queue, we're done here.
              break if data.length == 0

              # Reset the idle-shutdown countdown: we just saw work.
              @send_remaining_time = @send_time

              # Finally send the data (SenderBase#send).
              send data
            end

            # Wait at most local_send_interval seconds (or until signalled
            # through the queue's flush notification).
            result = local_queue.flush_notification.wait local_send_interval
            if result
              # Flush requested: clear the event and drain immediately.
              local_queue.flush_notification.clear
              next
            end

            # Decrement the remaining idle time.
            @send_remaining_time -= local_send_interval
            # If remaining time <= 0 and there is no start notification
            # unprocessed, then stop the working thread.
            if @send_remaining_time <= 0 && @start_notification_processed
              # Note: there is still a chance some start notification could be
              # missed, e.g., the start method got triggered between the above
              # and the following line. However the data is not lost, as it
              # would be processed when the next start notification comes
              # after the worker thread stops. The cost to ensure no
              # notification is ever missed would be taking a lock on every
              # #start call.
              @work_thread = nil
              break
            end
          end
        rescue Exception => e
          # Broad rescue is deliberate here: @work_thread must be reset even
          # when the thread dies abnormally, or #start could never restart it.
          @work_thread = nil
          @logger.error('application_insights') { "Asynchronous sender work thread terminated abnormally: #{e.to_s}" }
        end
      end
    end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/application.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/application.rb
new file mode 100644
index 000000000..071c37385
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/application.rb
@@ -0,0 +1,13 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Context contract carrying the monitored application's version, emitted
  # under the 'ai.application.ver' tag.
  class Application
    include JsonSerializable

    # @return [String] the application version value.
    attr_accessor :ver

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ver: 'ai.application.ver'
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/availability_data.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/availability_data.rb
new file mode 100644
index 000000000..d560dd15b
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/availability_data.rb
@@ -0,0 +1,34 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Telemetry payload describing an availability test result (id, name,
  # duration, success flag, run location, message) plus custom properties
  # and measurements bags.
  class AvailabilityData
    include JsonSerializable

    attr_accessor :ver, :id, :name, :duration, :success, :run_location, :message,
      :properties, :measurements

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ver: 'ver',
      id: 'id',
      name: 'name',
      duration: 'duration',
      success: 'success',
      run_location: 'runLocation',
      message: 'message',
      properties: 'properties',
      measurements: 'measurements'
    )

    # @return [Fixnum] schema version (defaults to 2 on first read).
    def ver
      @ver ||= 2
    end

    # @return [Hash] custom properties bag, lazily initialized.
    def properties
      @properties ||= {}
    end

    # @return [Hash] custom measurements bag, lazily initialized.
    def measurements
      @measurements ||= {}
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/base.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/base.rb
new file mode 100644
index 000000000..bb88a4625
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/base.rb
@@ -0,0 +1,13 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Minimal payload wrapper carrying only the 'baseType' discriminator.
  class Base
    include JsonSerializable

    # @return [String] the payload type discriminator.
    attr_accessor :base_type

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      base_type: 'baseType'
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/cloud.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/cloud.rb
new file mode 100644
index 000000000..5aaeeee04
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/cloud.rb
@@ -0,0 +1,14 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Context contract for the cloud role tags ('ai.cloud.role' and
  # 'ai.cloud.roleInstance').
  class Cloud
    include JsonSerializable

    # role: the cloud role name; role_instance: the specific instance.
    attr_accessor :role, :role_instance

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      role: 'ai.cloud.role',
      role_instance: 'ai.cloud.roleInstance'
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data.rb
new file mode 100644
index 000000000..c7184edfd
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data.rb
@@ -0,0 +1,14 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Envelope data wrapper: pairs a 'baseType' discriminator with the
  # 'baseData' payload object it describes.
  class Data
    include JsonSerializable

    attr_accessor :base_type, :base_data

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      base_type: 'baseType',
      base_data: 'baseData'
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data_point.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data_point.rb
new file mode 100644
index 000000000..6556b351b
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data_point.rb
@@ -0,0 +1,25 @@
+require_relative 'json_serializable'
+require_relative 'data_point_type'
+
module ApplicationInsights::Channel::Contracts
  # A single metric sample or aggregate: name/namespace, value, and — for
  # aggregates — count, min, max and standard deviation.
  class DataPoint
    include JsonSerializable

    attr_accessor :ns, :name, :kind, :value, :count, :min, :max, :std_dev

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ns: 'ns',
      name: 'name',
      kind: 'kind',
      value: 'value',
      count: 'count',
      min: 'min',
      max: 'max',
      std_dev: 'stdDev'
    )

    # @return [Fixnum] the data point kind; defaults to
    #   {DataPointType::MEASUREMENT} on first read.
    def kind
      @kind ||= DataPointType::MEASUREMENT
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data_point_type.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data_point_type.rb
new file mode 100644
index 000000000..f9816e4a9
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/data_point_type.rb
@@ -0,0 +1,7 @@
module ApplicationInsights::Channel::Contracts
  # Enumeration of {DataPoint} kinds.
  class DataPointType
    # A single sampled value.
    MEASUREMENT = 0

    # A pre-aggregated value (count/min/max/stdDev populated).
    AGGREGATION = 1
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/dependency_kind.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/dependency_kind.rb
new file mode 100644
index 000000000..38a441499
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/dependency_kind.rb
@@ -0,0 +1,9 @@
module ApplicationInsights::Channel::Contracts
  # Enumeration of remote dependency kinds.
  class DependencyKind
    SQL = 0

    HTTP = 1

    OTHER = 2
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/dependency_source_type.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/dependency_source_type.rb
new file mode 100644
index 000000000..a68dad72b
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/dependency_source_type.rb
@@ -0,0 +1,9 @@
module ApplicationInsights::Channel::Contracts
  # Enumeration of dependency source types.
  class DependencySourceType
    UNDEFINED = 0

    AIC = 1

    APMC = 2
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/device.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/device.rb
new file mode 100644
index 000000000..af6855102
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/device.rb
@@ -0,0 +1,18 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Context contract for device information, emitted under the
  # 'ai.device.*' tag names.
  class Device
    include JsonSerializable

    attr_accessor :id, :locale, :model, :oem_name, :os_version, :type

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      id: 'ai.device.id',
      locale: 'ai.device.locale',
      model: 'ai.device.model',
      oem_name: 'ai.device.oemName',
      os_version: 'ai.device.osVersion',
      type: 'ai.device.type'
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/domain.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/domain.rb
new file mode 100644
index 000000000..8a7ba880d
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/domain.rb
@@ -0,0 +1,10 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Base contract with no attributes of its own; declares an empty mapping
  # so {JsonSerializable#to_h} serializes it to an empty hash.
  class Domain
    include JsonSerializable

    attribute_mapping(
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/envelope.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/envelope.rb
new file mode 100644
index 000000000..b8608e388
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/envelope.rb
@@ -0,0 +1,32 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # The top-level wire envelope: names the payload, stamps it with time,
  # instrumentation key ('iKey'), sampling rate, sequence and context tags,
  # and carries the {Data} wrapper in 'data'.
  class Envelope
    include JsonSerializable

    attr_accessor :ver, :name, :time, :sample_rate, :seq, :i_key, :tags, :data

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ver: 'ver',
      name: 'name',
      time: 'time',
      sample_rate: 'sampleRate',
      seq: 'seq',
      i_key: 'iKey',
      tags: 'tags',
      data: 'data'
    )

    # @return [Fixnum] envelope schema version (defaults to 1 on first read).
    def ver
      @ver ||= 1
    end

    # @return [Float] sampling rate in percent (defaults to 100.0).
    def sample_rate
      @sample_rate ||= 100.0
    end

    # @return [Hash] context tags bag, lazily initialized.
    def tags
      @tags ||= {}
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/event_data.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/event_data.rb
new file mode 100644
index 000000000..4bfb16124
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/event_data.rb
@@ -0,0 +1,28 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Telemetry payload for a named custom event, with custom properties and
  # measurements bags.
  class EventData
    include JsonSerializable

    attr_accessor :ver, :name, :properties, :measurements

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ver: 'ver',
      name: 'name',
      properties: 'properties',
      measurements: 'measurements'
    )

    # @return [Fixnum] schema version (defaults to 2 on first read).
    def ver
      @ver ||= 2
    end

    # @return [Hash] custom properties bag, lazily initialized.
    def properties
      @properties ||= {}
    end

    # @return [Hash] custom measurements bag, lazily initialized.
    def measurements
      @measurements ||= {}
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/exception_data.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/exception_data.rb
new file mode 100644
index 000000000..5cffd1253
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/exception_data.rb
@@ -0,0 +1,35 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Telemetry payload for one logical exception: a list of
  # {ExceptionDetails} (outer plus inner exceptions), a severity level and a
  # problem id, with custom properties and measurements bags.
  class ExceptionData
    include JsonSerializable

    attr_accessor :ver, :exceptions, :severity_level, :problem_id, :properties,
      :measurements

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ver: 'ver',
      exceptions: 'exceptions',
      severity_level: 'severityLevel',
      problem_id: 'problemId',
      properties: 'properties',
      measurements: 'measurements'
    )

    # @return [Fixnum] schema version (defaults to 2 on first read).
    def ver
      @ver ||= 2
    end

    # @return [Array] list of exception detail records, lazily initialized.
    def exceptions
      @exceptions ||= []
    end

    # @return [Hash] custom properties bag, lazily initialized.
    def properties
      @properties ||= {}
    end

    # @return [Hash] custom measurements bag, lazily initialized.
    def measurements
      @measurements ||= {}
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/exception_details.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/exception_details.rb
new file mode 100644
index 000000000..85bfc6282
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/exception_details.rb
@@ -0,0 +1,28 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # One exception within an {ExceptionData} record: type, message, raw stack
  # text and a structured parsed stack; 'id'/'outerId' link inner and outer
  # exceptions.
  class ExceptionDetails
    include JsonSerializable

    attr_accessor :id, :outer_id, :type_name, :message, :has_full_stack, :stack,
      :parsed_stack

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      id: 'id',
      outer_id: 'outerId',
      type_name: 'typeName',
      message: 'message',
      has_full_stack: 'hasFullStack',
      stack: 'stack',
      parsed_stack: 'parsedStack'
    )

    # @return [Boolean] whether the full stack is available; defaults to
    #   true, but preserves an explicitly assigned false.
    def has_full_stack
      @has_full_stack.nil? ? true : @has_full_stack
    end

    # @return [Array] list of {StackFrame}-like entries, lazily initialized.
    def parsed_stack
      @parsed_stack ||= []
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/internal.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/internal.rb
new file mode 100644
index 000000000..6e8f3d300
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/internal.rb
@@ -0,0 +1,15 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Context contract for SDK-internal information, emitted under the
  # 'ai.internal.*' tag names.
  class Internal
    include JsonSerializable

    attr_accessor :sdk_version, :agent_version, :node_name

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      sdk_version: 'ai.internal.sdkVersion',
      agent_version: 'ai.internal.agentVersion',
      node_name: 'ai.internal.nodeName'
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/json_serializable.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/json_serializable.rb
new file mode 100644
index 000000000..60838e215
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/json_serializable.rb
@@ -0,0 +1,59 @@
+require 'yajl/json_gem'
+
module ApplicationInsights
  module Channel
    module Contracts
      # Mixin giving contract classes declarative JSON serialization.
      # Including classes declare their wire format with ::attribute_mapping
      # and gain #to_h / #to_json plus a hash-based constructor.
      module JsonSerializable
        module ClassMethods
          # @return [Hash] attribute name => JSON field name mapping declared
          #   via #attribute_mapping.
          attr_reader :json_mappings

          # Declares the attribute => JSON field mapping consumed by #to_h.
          # @param [Hash] mappings symbol attribute names mapped to their
          #   wire-format field names.
          def attribute_mapping(mappings = {})
            @json_mappings = mappings
          end
        end

        # Include hook: extends the including class with the mapping DSL.
        def self.included(klass)
          klass.extend JsonSerializable::ClassMethods
        end

        # Initializes the object from an attributes hash by invoking the
        # matching writer for each key (unknown keys raise NoMethodError).
        def initialize(attributes = {})
          attributes.each { |k, v| send(:"#{k}=", v) }
        end

        # Serializes the object to a Hash keyed by the declared JSON field
        # names. Reads each attribute through its getter (so lazy defaults
        # apply); nil values and empty collections are omitted.
        # @return [Hash] the JSON-ready representation.
        def to_h
          output = {}
          klass = self.class

          klass.json_mappings.each do |attr, name|
            value = visit self.send(attr)
            is_empty = value.respond_to?(:empty?) && value.empty?

            output[name] = value unless value.nil? || is_empty
          end

          output
        end

        # Serializes the object to a JSON string via #to_h.
        # @return [String] the JSON representation.
        def to_json(args = {})
          JSON.generate self.to_h, args
        end

        private

        # Recursively prepares a value for serialization: arrays and hashes
        # are walked element by element, anything responding to #to_h (nested
        # contract objects) is converted, and plain values pass through.
        def visit(object)
          return if object.nil?

          if object.is_a? Array
            object.map { |e| visit e }
          elsif object.is_a? Hash
            Hash[object.map { |k, v| [k, visit(v)] }]
          elsif object.respond_to? :to_h
            object.to_h
          else
            object
          end
        end
      end
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/location.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/location.rb
new file mode 100644
index 000000000..4136c869b
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/location.rb
@@ -0,0 +1,13 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Context contract carrying the client IP, emitted under 'ai.location.ip'.
  class Location
    include JsonSerializable

    # @return [String] the location IP value.
    attr_accessor :ip

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ip: 'ai.location.ip'
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/message_data.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/message_data.rb
new file mode 100644
index 000000000..1340f5ba7
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/message_data.rb
@@ -0,0 +1,24 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Telemetry payload for a trace/log message with an optional severity
  # level and a custom properties bag.
  class MessageData
    include JsonSerializable

    attr_accessor :ver, :message, :severity_level, :properties

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ver: 'ver',
      message: 'message',
      severity_level: 'severityLevel',
      properties: 'properties'
    )

    # @return [Fixnum] schema version (defaults to 2 on first read).
    def ver
      @ver ||= 2
    end

    # @return [Hash] custom properties bag, lazily initialized.
    def properties
      @properties ||= {}
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/metric_data.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/metric_data.rb
new file mode 100644
index 000000000..bcb5739d6
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/metric_data.rb
@@ -0,0 +1,27 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Telemetry payload for metrics: a list of {DataPoint} entries plus a
  # custom properties bag.
  class MetricData
    include JsonSerializable

    attr_accessor :ver, :metrics, :properties

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ver: 'ver',
      metrics: 'metrics',
      properties: 'properties'
    )

    # @return [Fixnum] schema version (defaults to 2 on first read).
    def ver
      @ver ||= 2
    end

    # @return [Array] list of data points, lazily initialized.
    def metrics
      @metrics ||= []
    end

    # @return [Hash] custom properties bag, lazily initialized.
    def properties
      @properties ||= {}
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/operation.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/operation.rb
new file mode 100644
index 000000000..c86dd111b
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/operation.rb
@@ -0,0 +1,17 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Context contract for operation correlation, emitted under the
  # 'ai.operation.*' tag names (id, name, parent id, synthetic source,
  # correlation vector).
  class Operation
    include JsonSerializable

    attr_accessor :id, :name, :parent_id, :synthetic_source, :correlation_vector

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      id: 'ai.operation.id',
      name: 'ai.operation.name',
      parent_id: 'ai.operation.parentId',
      synthetic_source: 'ai.operation.syntheticSource',
      correlation_vector: 'ai.operation.correlationVector'
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/page_view_data.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/page_view_data.rb
new file mode 100644
index 000000000..d17dd2f79
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/page_view_data.rb
@@ -0,0 +1,33 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Telemetry payload for a page view (url, name, duration, referrer) with
  # custom properties and measurements bags.
  class PageViewData
    include JsonSerializable

    attr_accessor :ver, :url, :name, :duration, :id, :referrer_uri, :properties,
      :measurements

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ver: 'ver',
      url: 'url',
      name: 'name',
      duration: 'duration',
      id: 'id',
      referrer_uri: 'referrerUri',
      properties: 'properties',
      measurements: 'measurements'
    )

    # @return [Fixnum] schema version (defaults to 2 on first read).
    def ver
      @ver ||= 2
    end

    # @return [Hash] custom properties bag, lazily initialized.
    def properties
      @properties ||= {}
    end

    # @return [Hash] custom measurements bag, lazily initialized.
    def measurements
      @measurements ||= {}
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/page_view_perf_data.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/page_view_perf_data.rb
new file mode 100644
index 000000000..adde3f3ad
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/page_view_perf_data.rb
@@ -0,0 +1,39 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Telemetry payload for a page view with client performance timings
  # (perfTotal, networkConnect, sentRequest, receivedResponse,
  # domProcessing) plus custom properties and measurements bags.
  class PageViewPerfData
    include JsonSerializable

    attr_accessor :ver, :url, :perf_total, :name, :duration, :network_connect,
      :sent_request, :received_response, :id, :dom_processing, :referrer_uri,
      :properties, :measurements

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ver: 'ver',
      url: 'url',
      perf_total: 'perfTotal',
      name: 'name',
      duration: 'duration',
      network_connect: 'networkConnect',
      sent_request: 'sentRequest',
      received_response: 'receivedResponse',
      id: 'id',
      dom_processing: 'domProcessing',
      referrer_uri: 'referrerUri',
      properties: 'properties',
      measurements: 'measurements'
    )

    # @return [Fixnum] schema version (defaults to 2 on first read).
    def ver
      @ver ||= 2
    end

    # @return [Hash] custom properties bag, lazily initialized.
    def properties
      @properties ||= {}
    end

    # @return [Hash] custom measurements bag, lazily initialized.
    def measurements
      @measurements ||= {}
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/remote_dependency_data.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/remote_dependency_data.rb
new file mode 100644
index 000000000..a238841f6
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/remote_dependency_data.rb
@@ -0,0 +1,40 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Telemetry payload for a call to a remote dependency (name, id, result
  # code, duration, success, target, type) with custom properties and
  # measurements bags.
  class RemoteDependencyData
    include JsonSerializable

    attr_accessor :ver, :name, :id, :result_code, :duration, :success, :data,
      :target, :type, :properties, :measurements

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ver: 'ver',
      name: 'name',
      id: 'id',
      result_code: 'resultCode',
      duration: 'duration',
      success: 'success',
      data: 'data',
      target: 'target',
      type: 'type',
      properties: 'properties',
      measurements: 'measurements'
    )

    # @return [Fixnum] schema version (defaults to 2 on first read).
    def ver
      @ver ||= 2
    end

    # @return [Boolean] whether the call succeeded; defaults to true, but
    #   preserves an explicitly assigned false.
    def success
      @success.nil? ? true : @success
    end

    # @return [Hash] custom properties bag, lazily initialized.
    def properties
      @properties ||= {}
    end

    # @return [Hash] custom measurements bag, lazily initialized.
    def measurements
      @measurements ||= {}
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/reopenings.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/reopenings.rb
new file mode 100644
index 000000000..394bf8afb
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/reopenings.rb
@@ -0,0 +1,27 @@
+module ApplicationInsights::Channel::Contracts
+ class ExceptionData
+ def handled_at
+ @properties["handledAt"] if @properties
+ end
+
+ def handled_at=(handled_at)
+ if handled_at
+ @properties ||= {}
+ @properties["handledAt"] = handled_at
+ end
+ end
+ end
+
+ class RequestData
+ def http_method
+ @properties["httpMethod"] if @properties
+ end
+
+ def http_method=(http_method)
+ if http_method
+ @properties ||= {}
+ @properties["httpMethod"] = http_method
+ end
+ end
+ end
+end
\ No newline at end of file
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/request_data.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/request_data.rb
new file mode 100644
index 000000000..af2581c2b
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/request_data.rb
@@ -0,0 +1,35 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Telemetry payload for an incoming request (id, source, name, duration,
  # response code, success, url) with custom properties and measurements
  # bags.
  class RequestData
    include JsonSerializable

    attr_accessor :ver, :id, :source, :name, :duration, :response_code, :success,
      :url, :properties, :measurements

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      ver: 'ver',
      id: 'id',
      source: 'source',
      name: 'name',
      duration: 'duration',
      response_code: 'responseCode',
      success: 'success',
      url: 'url',
      properties: 'properties',
      measurements: 'measurements'
    )

    # @return [Fixnum] schema version (defaults to 2 on first read).
    def ver
      @ver ||= 2
    end

    # @return [Hash] custom properties bag, lazily initialized.
    def properties
      @properties ||= {}
    end

    # @return [Hash] custom measurements bag, lazily initialized.
    def measurements
      @measurements ||= {}
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/session.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/session.rb
new file mode 100644
index 000000000..a761c51c5
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/session.rb
@@ -0,0 +1,14 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Context contract for session tracking, emitted under 'ai.session.id'
  # and 'ai.session.isFirst'.
  class Session
    include JsonSerializable

    attr_accessor :id, :is_first

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      id: 'ai.session.id',
      is_first: 'ai.session.isFirst'
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/severity_level.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/severity_level.rb
new file mode 100644
index 000000000..322a00ec3
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/severity_level.rb
@@ -0,0 +1,13 @@
module ApplicationInsights::Channel::Contracts
  # Enumeration of message/exception severity levels, lowest to highest.
  class SeverityLevel
    VERBOSE = 0

    INFORMATION = 1

    WARNING = 2

    ERROR = 3

    CRITICAL = 4
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/stack_frame.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/stack_frame.rb
new file mode 100644
index 000000000..b4f4b9844
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/stack_frame.rb
@@ -0,0 +1,17 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # A single frame of a parsed stack trace (see
  # {ExceptionDetails#parsed_stack}).
  class StackFrame
    include JsonSerializable

    attr_accessor :level, :method, :assembly, :file_name, :line

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      level: 'level',
      method: 'method',
      assembly: 'assembly',
      file_name: 'fileName',
      line: 'line'
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/user.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/user.rb
new file mode 100644
index 000000000..a7ff8a7cf
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/contracts/user.rb
@@ -0,0 +1,15 @@
+require_relative 'json_serializable'
+
module ApplicationInsights::Channel::Contracts
  # Context contract for user identity, emitted under the 'ai.user.*' tag
  # names (account id, anonymous id, authenticated id).
  class User
    include JsonSerializable

    attr_accessor :account_id, :id, :auth_user_id

    # Attribute => wire-format field names used by {JsonSerializable#to_h}.
    attribute_mapping(
      account_id: 'ai.user.accountId',
      id: 'ai.user.id',
      auth_user_id: 'ai.user.authUserId'
    )
  end
end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/event.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/event.rb
new file mode 100644
index 000000000..ae61064f8
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/event.rb
@@ -0,0 +1,68 @@
+require_relative 'queue_base'
+require 'thread'
+
+module ApplicationInsights
+ module Channel
+ # An event class that allows simple cross-thread signalling.
+ #
+ # An object of this type manages an internal flag that can be set to true
+ # via the {#set} method and reset via the {#clear} method. Calling the
+ # {#wait} method will block until the flag is set to true.
+ #
+ # @example
+ # require 'application_insights'
+ # require 'thread'
+ # event = ApplicationInsights::Channel::Event.new
+ # Thread.new do
+ # sleep 1
+ # event.set
+ # end
+ # puts 'Main screen turn on.'
+ # result = event.wait
+ # puts 'All your base are belong to us.'
+ class Event
+ # Initializes a new instance of the class.
+ def initialize
+ @mutex = Mutex.new
+ @condition_variable = ConditionVariable.new
+ @signal = false
+ end
+
+ # The signal value for this object. Note that the value of this property is
+ # not synchronized with respect to {#set} and {#clear} meaning that it
+ # could return false positives or negatives.
+ # @return [Boolean] the signal value.
+ attr_reader :signal
+
+ # Sets the internal flag to true. Calling this method will also cause all
+ # waiting threads to awaken.
+ def set
+ @mutex.synchronize do
+ @signal = true
+ @condition_variable.broadcast
+ end
+ end
+
+ # Sets the internal flag to false.
+ def clear
+ @mutex.synchronize do
+ @signal = false
+ end
+ end
+
+ # Calling this method will block until the internal flag is set to true.
+ # If the flag is set to true before calling this method, we will return
+ # immediately. If the timeout parameter is specified, the method will
+ # unblock after the specified number of seconds.
+ # @param [Fixnum] timeout the timeout for the operation in seconds.
+ # @return [Boolean] the value of the internal flag on exit.
+ def wait(timeout=nil)
+ @mutex.synchronize do
+ @condition_variable.wait(@mutex, timeout) unless @signal
+ end
+
+ @signal
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/queue_base.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/queue_base.rb
new file mode 100644
index 000000000..91226b17f
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/queue_base.rb
@@ -0,0 +1,73 @@
+require 'thread'
+
+module ApplicationInsights
+ module Channel
+ # The base class for all types of queues for use in conjunction with an
+ # implementation of {SenderBase}. The queue will notify the sender that it
+ # needs to pick up items when it reaches {#max_queue_length}, or when the
+ # consumer calls {#flush}.
+ class QueueBase
+ # Initializes a new instance of the class.
+ # @param [SenderBase] sender the sender object that will be used in
+ # conjunction with this queue.
+ def initialize(sender)
+ @queue = Queue.new
+ @max_queue_length = 500
+ self.sender = sender
+ end
+
+ # The maximum number of items that will be held by the queue before the
+ # queue will call the {#flush} method.
+ # @return [Fixnum] the maximum queue size. (defaults to: 500)
+ attr_accessor :max_queue_length
+
+ # The sender that is associated with this queue that this queue will use to
+ # send data to the service.
+ # @return [SenderBase] the sender object.
+ attr_reader :sender
+
+ # Change the sender that is associated with this queue.
+ # @param [SenderBase] sender the sender object.
+ # @return [SenderBase] the sender object.
+ def sender=(sender)
+ @sender = sender
+ @sender.queue = self if sender
+ @sender
+ end
+
+ # Adds the passed in item object to the queue and calls {#flush} if the
+ # size of the queue is larger than {#max_queue_length}. This method does
+ # nothing if the passed in item is nil.
+ # @param [Contracts::Envelope] item the telemetry envelope object to send
+ # to the service.
+ def push(item)
+ return unless item
+
+ @queue.push(item)
+
+ flush if @queue.length >= @max_queue_length
+ end
+
+ # Pops a single item from the queue and returns it. If the queue is empty,
+ # this method will return nil.
+ # @return [Contracts::Envelope] a telemetry envelope object or nil if the
+ # queue is empty.
+ def pop
+ return @queue.pop(true)
+ rescue ThreadError
+ return nil
+ end
+
+ # Flushes the current queue by notifying the {#sender}. This method needs
+ # to be overridden by a concrete implementations of the queue class.
+ def flush
+ end
+
+ # Indicates whether the queue is empty.
+ # @return [Boolean] true if the queue is empty
+ def empty?
+ @queue.empty?
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/sender_base.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/sender_base.rb
new file mode 100644
index 000000000..bedbae4ee
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/sender_base.rb
@@ -0,0 +1,96 @@
+require 'yajl/json_gem'
+require 'net/http'
+require 'openssl'
+require 'stringio'
+require 'zlib'
+require 'logger'
+
+module ApplicationInsights
+ module Channel
+ # The base class for all types of senders for use in conjunction with an
+ # implementation of {QueueBase}. The queue will notify the sender that it
+ # needs to pick up items. The concrete sender implementation will listen to
+ # these notifications and will pull items from the queue using
+ # {QueueBase#pop} getting at most {#send_buffer_size} items.
+ # It will then call {#send} using the list of items pulled from the queue.
+ class SenderBase
+ # Initializes a new instance of the class.
+ # @param [String] service_endpoint_uri the address of the service to send
+ # @param [Hash] proxy server configuration to send (optional)
+ # telemetry data to.
+ def initialize(service_endpoint_uri, proxy = {})
+ @service_endpoint_uri = service_endpoint_uri
+ @queue = nil
+ @send_buffer_size = 100
+ @logger = Logger.new(STDOUT)
+ @proxy = proxy
+ end
+
+ # The service endpoint URI where this sender will send data to.
+ # @return [String] the service endpoint URI.
+ attr_accessor :service_endpoint_uri
+
+ # The queue that this sender is draining. While {SenderBase} doesn't
+ # implement any means of doing so, derivations of this class do.
+ # @return [QueueBase] the queue instance that this sender is draining.
+ attr_accessor :queue
+
+ # The buffer size for a single batch of telemetry. This is the maximum number
+ # of items in a single service request that this sender is going to send.
+ # @return [Fixnum] the maximum number of items in a telemetry batch.
+ attr_accessor :send_buffer_size
+
+ # The logger for the sender.
+ attr_accessor :logger
+
+ # The proxy for the sender.
+ attr_accessor :proxy
+
+ # Immediately sends the data passed in to {#service_endpoint_uri}. If the
+ # service request fails, the passed in items are pushed back to the {#queue}.
+ # @param [Array] data_to_send an array of
+ # {Contracts::Envelope} objects to send to the service.
+ def send(data_to_send)
+ uri = URI(@service_endpoint_uri)
+ headers = {
+ 'Accept' => 'application/json',
+ 'Content-Type' => 'application/json; charset=utf-8',
+ 'Content-Encoding' => 'gzip'
+ }
+ request = Net::HTTP::Post.new(uri.path, headers)
+
+ # Use JSON.generate instead of to_json, otherwise it will
+ # default to ActiveSupport::JSON.encode for Rails app
+ json = JSON.generate(data_to_send)
+ compressed_data = compress(json)
+ request.body = compressed_data
+ if @proxy.nil? || @proxy.empty?
+ http = Net::HTTP.new uri.hostname, uri.port
+ else
+ http = Net::HTTP.new(uri.hostname, uri.port, @proxy[:addr], @proxy[:port], @proxy[:user], @proxy[:pass])
+ end
+ if uri.scheme.downcase == 'https'
+ http.use_ssl = true
+ http.verify_mode = OpenSSL::SSL::VERIFY_PEER
+ end
+
+ response = http.request(request)
+ http.finish if http.started?
+
+ if !response.kind_of? Net::HTTPSuccess
+ @logger.warn('application_insights') { "Failed to send data: #{response.message}" }
+ end
+ end
+
+ private
+
+ def compress(string)
+ wio = StringIO.new("w")
+ w_gz = Zlib::GzipWriter.new wio, nil, nil
+ w_gz.write(string)
+ w_gz.close
+ wio.string
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/synchronous_queue.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/synchronous_queue.rb
new file mode 100644
index 000000000..13c2281ac
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/synchronous_queue.rb
@@ -0,0 +1,45 @@
+require_relative 'queue_base'
+
+module ApplicationInsights
+ module Channel
+ # A synchronous queue for use in conjunction with the {SynchronousSender}.
+ # The queue will call {SenderBase#send} when it reaches {#max_queue_length},
+ # or when the consumer calls {#flush}.
+ #
+ # @example
+ # require 'application_insights'
+ # require 'thread'
+ # queue = ApplicationInsights::Channel::SynchronousQueue.new nil
+ # queue.max_queue_length = 1
+ # queue.push 1
+ class SynchronousQueue < QueueBase
+ # Initializes a new instance of the class.
+ # @param [SenderBase] sender the sender object that will be used in
+ # conjunction with this queue.
+ def initialize(sender)
+ super sender
+ end
+
+ # Flushes the current queue by by calling {#sender}'s
+ # {SenderBase#send} method.
+ def flush
+ local_sender = @sender
+ return unless local_sender
+
+ while true
+ # get at most send_buffer_size items and send them
+ data = []
+ while data.length < local_sender.send_buffer_size
+ item = pop()
+ break if not item
+ data.push item
+ end
+
+ break if data.length == 0
+
+ local_sender.send(data)
+ end
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/synchronous_sender.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/synchronous_sender.rb
new file mode 100644
index 000000000..597e97b9e
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/synchronous_sender.rb
@@ -0,0 +1,22 @@
+require_relative "sender_base"
+
+module ApplicationInsights
+ module Channel
+ # A synchronous sender that works in conjunction with the {SynchronousQueue}.
+ # The queue will call {#send} on the current instance with the data to send.
+ class SynchronousSender < SenderBase
+ SERVICE_ENDPOINT_URI = "https://dc.services.visualstudio.com/v2/track"
+ # Initializes a new instance of the class.
+ # @param [String] service_endpoint_uri the address of the service to send
+ # @param [Hash] proxy server configuration to send (optional)
+ # telemetry data to.
+ def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, proxy = {})
+ # callers which requires proxy dont require to maintain service endpoint uri which potentially can change
+ if service_endpoint_uri.nil? || service_endpoint_uri.empty?
+ service_endpoint_uri = SERVICE_ENDPOINT_URI
+ end
+ super service_endpoint_uri, proxy
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/telemetry_channel.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/telemetry_channel.rb
new file mode 100644
index 000000000..e026ebf7d
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/telemetry_channel.rb
@@ -0,0 +1,131 @@
+require 'time'
+require_relative 'asynchronous_queue'
+require_relative 'asynchronous_sender'
+require_relative 'telemetry_context'
+require_relative 'synchronous_queue'
+require_relative 'synchronous_sender'
+require_relative 'contracts/envelope'
+require_relative 'contracts/data'
+require_relative 'contracts/internal'
+require_relative '../../application_insights/version'
+
+module ApplicationInsights
+ module Channel
+ # The telemetry channel is responsible for constructing a
+ # {Contracts::Envelope} object from the passed in data and specified
+ # telemetry context.
+ #
+ # @example
+ # require 'application_insights'
+ # channel = ApplicationInsights::Channel::TelemetryChannel.new
+ # event = ApplicationInsights::Channel::Contracts::EventData.new name: 'My event'
+ # channel.write event
+ class TelemetryChannel
+ # Initializes a new instance of the class.
+ # @param [TelemetryContext] context the telemetry context to use when
+ # sending telemetry data.
+ # @param [QueueBase] queue the queue to enqueue the resulting
+ # {Contracts::Envelope} to.
+ def initialize(context=nil, queue=nil)
+ @context = context || TelemetryContext.new
+ @queue = queue || SynchronousQueue.new(SynchronousSender.new)
+ end
+
+ # The context associated with this channel. All {Contracts::Envelope}
+ # objects created by this channel will use this value if it's present or if
+ # none is specified as part of the {#write} call.
+ # @return [TelemetryContext] the context instance
+ # (defaults to: TelemetryContext.new)
+ attr_reader :context
+
+ # The queue associated with this channel. All {Contracts::Envelope} objects
+ # created by this channel will be pushed to this queue.
+ # @return [QueueBase] the queue instance (defaults to: SynchronousQueue.new)
+ attr_reader :queue
+
+ # The sender associated with this channel. This instance will be used to
+ # transmit telemetry to the service.
+ # @return [SenderBase] the sender instance (defaults to: SynchronousSender.new)
+ def sender
+ @queue.sender
+ end
+
+ # Flushes the enqueued data by calling {QueueBase#flush}.
+ def flush
+ @queue.flush
+ end
+
+ # Enqueues the passed in data to the {#queue}. If the caller specifies a
+ # context as well, it will take precedence over the instance in {#context}.
+ # @param [Object] data the telemetry data to send. This will be wrapped in
+ # an {Contracts::Envelope} before being enqueued to the {#queue}.
+ # @param [TelemetryContext] context the override context to use when
+ # constructing the {Contracts::Envelope}.
+ # @param [Time|String] time the timestamp of the telemetry used to construct the
+ # {Contracts::Envelope}. A String is used as-is (assumed to already be
+ # ISO-8601); a Time is converted; otherwise Time.now is used.
+ def write(data, context=nil, time=nil)
+ local_context = context || @context
+ raise ArgumentError, 'Context was required but not provided' unless local_context
+
+ # Normalize the timestamp to ISO-8601 with 7 fractional digits.
+ if time && time.is_a?(String)
+ local_time = time
+ elsif time && time.is_a?(Time)
+ local_time = time.iso8601(7)
+ else
+ local_time = Time.now.iso8601(7)
+ end
+
+ # Strip any module namespace, leaving e.g. 'EventData'.
+ data_type = data.class.name.gsub(/^.*::/, '')
+ set_properties data, local_context
+ data_attributes = {
+ :base_type => data_type,
+ :base_data => data
+ }
+ envelope_attributes = {
+ # data_type[0..-5] drops the trailing 'Data' suffix, so 'EventData'
+ # yields 'Microsoft.ApplicationInsights.Event'.
+ :name => 'Microsoft.ApplicationInsights.' + data_type[0..-5],
+ :time => local_time,
+ :i_key => local_context.instrumentation_key,
+ :tags => get_tags(local_context),
+ :data => Contracts::Data.new(data_attributes)
+ }
+ envelope = Contracts::Envelope.new envelope_attributes
+ @queue.push(envelope)
+ end
+
+ private
+
+ # Builds the envelope tag hash by merging the SDK-internal context with
+ # every sub-context present on the given context, then dropping nil values.
+ def get_tags(context)
+ hash = {}
+ internal_context_attributes = {
+ :sdk_version => 'rb:' + ApplicationInsights::VERSION
+ }
+ internal_context = Contracts::Internal.new internal_context_attributes
+
+ [internal_context,
+ context.application,
+ context.cloud,
+ context.device,
+ context.user,
+ context.session,
+ context.location,
+ context.operation].each { |c| hash.merge!(c.to_h) if c }
+
+ hash.delete_if { |k, v| v.nil? }
+
+ hash
+ end
+
+ # Copies context-level custom properties onto the data item without
+ # overwriting keys the data item already defines.
+ def set_properties(data, context)
+ if context.properties
+ properties = data.properties || {}
+ context.properties.each do |key, value|
+ unless properties.key?(key)
+ properties[key] = value
+ end
+ end
+ data.properties = properties
+ end
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/channel/telemetry_context.rb b/source/plugins/ruby-fluentd4/lib/application_insights/channel/telemetry_context.rb
new file mode 100644
index 000000000..bb24af24e
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/channel/telemetry_context.rb
@@ -0,0 +1,85 @@
+require_relative 'contracts/application'
+require_relative 'contracts/cloud'
+require_relative 'contracts/device'
+require_relative 'contracts/user'
+require_relative 'contracts/session'
+require_relative 'contracts/operation'
+require_relative 'contracts/location'
+
+module ApplicationInsights
+ module Channel
+ # Represents the context for sending telemetry to the
+ # Application Insights service.
+ #
+ # @example
+ # require 'application_insights'
+ # context = ApplicationInsights::Channel::TelemetryContext.new
+ # context.instrumentation_key = ''
+ # context.application.id = 'My application'
+ # context.application.ver = '1.2.3'
+ # context.device.id = 'My current device'
+ # context.device.oem_name = 'Asus'
+ # context.device.model = 'X31A'
+ # context.device.type = "Other"
+ # context.user.id = 'santa@northpole.net'
+ class TelemetryContext
+ # Initializes a new instance of the class.
+ def initialize
+ @instrumentation_key = nil
+ @application = Contracts::Application.new
+ @cloud = Contracts::Cloud.new
+ @device = Contracts::Device.new
+ @user = Contracts::User.new
+ @session = Contracts::Session.new
+ @operation = Contracts::Operation.new
+ @location = Contracts::Location.new
+ @properties = {}
+ end
+
+ # The instrumentation key that is used to identify which
+ # Application Insights application this data is for.
+ # @return [String] the instrumentation key.
+ attr_accessor :instrumentation_key
+
+ # The application context. This contains properties of the
+ # application you are running.
+ # @return [Contracts::Application] the context object.
+ attr_accessor :application
+
+ # The cloud context. This contains properties of the
+ # cloud role you are generating telemetry for.
+ # @return [Contracts::Cloud] the context object.
+ attr_accessor :cloud
+
+ # The device context. This contains properties of the
+ # device you are running on.
+ # @return [Contracts::Device] the context object.
+ attr_accessor :device
+
+ # The user context. This contains properties of the
+ # user you are generating telemetry for.
+ # @return [Contracts::User] the context object.
+ attr_accessor :user
+
+ # The session context. This contains properties of the
+ # session you are generating telemetry for.
+ # @return [Contracts::Session] the context object.
+ attr_accessor :session
+
+ # The operation context. This contains properties of the
+ # operation you are generating telemetry for.
+ # @return [Contracts::Operation] the context object.
+ attr_accessor :operation
+
+ # The location context. This contains properties of the
+ # location you are generating telemetry from.
+ # @return [Contracts::Location] the context object.
+ attr_accessor :location
+
+ # The property context. This contains free-form properties
+ # that you can add to your telemetry.
+ # @return [Hash] the context object.
+ attr_accessor :properties
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/rack/track_request.rb b/source/plugins/ruby-fluentd4/lib/application_insights/rack/track_request.rb
new file mode 100644
index 000000000..62c2b0844
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/rack/track_request.rb
@@ -0,0 +1,154 @@
+require 'rack'
+require 'securerandom'
+require_relative '../channel/contracts/request_data'
+require_relative '../telemetry_client'
+
+module ApplicationInsights
+ module Rack
+ # Track every request and sends the request data to Application Insights.
+ class TrackRequest
+ # Initializes a new instance of the class.
+ # @param [Object] app the inner rack application.
+ # @param [String] instrumentation_key to identify which Application Insights
+ # application this data is for.
+ # @param [Integer] buffer_size the buffer size; buffered requests are
+ # sent to Application Insights when the buffer is full.
+ # @param [Integer] send_interval the frequency (in seconds) to check buffer
+ # and send buffered requests to Application Insights if any.
+ def initialize(app, instrumentation_key, buffer_size = 500, send_interval = 60)
+ @app = app
+ @instrumentation_key = instrumentation_key
+ @buffer_size = buffer_size
+ @send_interval = send_interval
+
+ # Requests are buffered in an asynchronous queue and flushed either
+ # when the buffer fills up or on the periodic send interval.
+ @sender = Channel::AsynchronousSender.new
+ @sender.send_interval = @send_interval
+ queue = Channel::AsynchronousQueue.new @sender
+ queue.max_queue_length = @buffer_size
+ @channel = Channel::TelemetryChannel.new nil, queue
+
+ @client = TelemetryClient.new @instrumentation_key, @channel
+ end
+
+ # Track requests and send data to Application Insights asynchronously.
+ # @param [Hash] env the rack environment.
+ def call(env)
+ # Build a request ID, incorporating one from our request if one exists.
+ request_id = request_id_header(env['HTTP_REQUEST_ID'])
+ env['ApplicationInsights.request.id'] = request_id
+
+ start = Time.now
+ begin
+ status, headers, response = @app.call(env)
+ rescue Exception => ex
+ # Rescuing Exception (not StandardError) is deliberate here: the
+ # failure is recorded as a 500 request and re-raised further below.
+ status = 500
+ exception = ex
+ end
+ stop = Time.now
+
+ start_time = start.iso8601(7)
+ duration = format_request_duration(stop - start)
+ success = status.to_i < 400
+
+ request = ::Rack::Request.new env
+ options = options_hash(request)
+
+ data = request_data(request_id, start_time, duration, status, success, options)
+ context = telemetry_context(request_id, env['HTTP_REQUEST_ID'])
+
+ @client.channel.write data, context, start_time
+
+ # Record the exception, then propagate it so the outer middleware
+ # stack still sees the failure.
+ if exception
+ @client.track_exception exception, handled_at: 'Unhandled'
+ raise exception
+ end
+
+ [status, headers, response]
+ end
+
+ private
+
+ # Swaps in a different asynchronous sender (used internally/by tests),
+ # keeping the queue and sender linked.
+ def sender=(sender)
+ if sender.is_a? Channel::AsynchronousSender
+ @sender = sender
+ @client.channel.queue.sender = @sender
+ end
+ end
+
+ # The telemetry client used by this middleware.
+ def client
+ @client
+ end
+
+ # Formats a duration in seconds as the d.HH:MM:SS.fffffff string;
+ # %7N yields 7 fractional digits (100ns resolution).
+ def format_request_duration(duration_seconds)
+ if duration_seconds >= 86400
+ # just return 1 day when it takes more than 1 day which should not happen for requests.
+ return "%02d.%02d:%02d:%02d.%07d" % [1, 0, 0, 0, 0]
+ end
+
+ Time.at(duration_seconds).gmtime.strftime("00.%H:%M:%S.%7N")
+ end
+
+ # Builds this request's ID. When the incoming Request-Id header is valid
+ # (starts with '|'), a child ID is appended; otherwise a new root ID of
+ # the form "|<random>." is generated.
+ # NOTE(review): appears to follow the hierarchical Request-Id
+ # correlation protocol — confirm against the HTTP correlation spec.
+ def request_id_header(request_id)
+ valid_request_id_header = valid_request_id(request_id)
+
+ length = valid_request_id_header ? 5 : 10
+ id = SecureRandom.base64(length)
+
+ if valid_request_id_header
+ request_id_has_end = %w[. _].include?(request_id[-1])
+ request_id << '.' unless request_id_has_end
+
+ return "#{request_id}#{id}_"
+ end
+
+ "|#{id}."
+ end
+
+ # A request ID is considered valid when it starts with '|'.
+ def valid_request_id(request_id)
+ request_id && request_id[0] == '|'
+ end
+
+ def operation_id(id)
+ # Returns the root ID from the '|' to the first '.' if any.
+ root_start = id[0] == '|' ? 1 : 0
+
+ # The slice below uses an inclusive end index, hence the -1 adjustment.
+ root_end = id.index('.')
+ root_end = root_end ? root_end - 1 : id.length - root_start
+
+ id[root_start..root_end]
+ end
+
+ # Name/method/url details extracted from the rack request.
+ def options_hash(request)
+ {
+ name: "#{request.request_method} #{request.path}",
+ http_method: request.request_method,
+ url: request.url
+ }
+ end
+
+ # Builds the RequestData contract for one tracked request.
+ def request_data(request_id, start_time, duration, status, success, options)
+ Channel::Contracts::RequestData.new(
+ :id => request_id || 'Null',
+ :duration => duration || '0:00:00:00.0000000',
+ :response_code => status || 200,
+ :success => success == nil ? true : success,
+ :name => options[:name],
+ :url => options[:url],
+ :properties => options[:properties] || {},
+ :measurements => options[:measurements] || {},
+ # Must initialize http_method after properties because it's actually stored in properties
+ :http_method => options[:http_method]
+ )
+ end
+
+ # Builds a per-request telemetry context carrying correlation IDs.
+ def telemetry_context(request_id, request_id_header)
+ context = Channel::TelemetryContext.new
+ context.instrumentation_key = @instrumentation_key
+ context.operation.id = operation_id(request_id)
+ context.operation.parent_id = request_id_header
+
+ context
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/telemetry_client.rb b/source/plugins/ruby-fluentd4/lib/application_insights/telemetry_client.rb
new file mode 100644
index 000000000..bd066ae70
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/telemetry_client.rb
@@ -0,0 +1,232 @@
+require_relative 'channel/telemetry_context'
+require_relative 'channel/telemetry_channel'
+require_relative 'channel/contracts/page_view_data'
+require_relative 'channel/contracts/remote_dependency_data'
+require_relative 'channel/contracts/exception_data'
+require_relative 'channel/contracts/exception_details'
+require_relative 'channel/contracts/event_data'
+require_relative 'channel/contracts/data_point'
+require_relative 'channel/contracts/data_point_type'
+require_relative 'channel/contracts/metric_data'
+require_relative 'channel/contracts/message_data'
+require_relative 'channel/contracts/stack_frame'
+require_relative 'channel/contracts/request_data'
+require_relative 'channel/contracts/severity_level'
+require_relative 'channel/contracts/reopenings'
+
+module ApplicationInsights
+ # The telemetry client used for sending all types of telemetry. It serves as
+ # the main entry point for interacting with the Application Insights service.
+ class TelemetryClient
+ # Initializes a new instance of the class.
+ # @param [String] instrumentation_key to identify which Application Insights
+ # application this data is for.
+ # @param [Channel::TelemetryChannel] telemetry_channel the optional telemetry
+ # channel to be used instead of constructing a default one.
+ def initialize(instrumentation_key = nil, telemetry_channel = nil)
+ @context = Channel::TelemetryContext.new
+ @context.instrumentation_key = instrumentation_key
+ @channel = telemetry_channel || Channel::TelemetryChannel.new
+ end
+
+ # The context associated with this client. All data objects created by this
+ # client will be accompanied by this value.
+ # @return [Channel::TelemetryContext] the context instance.
+ attr_reader :context
+
+ # The channel associated with this telemetry client. All data created by this
+ # client will be passed along with the {#context} object to
+ # {Channel::TelemetryChannel#write}
+ # @return [Channel::TelemetryChannel] the channel instance.
+ attr_reader :channel
+
+ # Send information about the page viewed in the application (a web page for
+ # instance).
+ # @param [String] name the name of the page that was viewed.
+ # @param [String] url the URL of the page that was viewed.
+ # @param [Hash] options the options to create the
+ # {Channel::Contracts::PageViewData} object.
+ # @option options [Fixnum] :duration the duration of the page view in
+ # milliseconds. (defaults to: 0)
+ # @option options [Hash] :properties the set of custom properties the client
+ # wants attached to this data item. (defaults to: {})
+ # @option options [Hash] :measurements the set of custom measurements the
+ # client wants to attach to this data item (defaults to: {})
+ def track_page_view(name, url, options={})
+ data_attributes = {
+ :name => name || 'Null',
+ :url => url,
+ :duration => options[:duration],
+ :properties => options[:properties] || {},
+ :measurements => options[:measurements] || {}
+ }
+ data = Channel::Contracts::PageViewData.new data_attributes
+ self.channel.write(data, self.context)
+ end
+
+ # Send information about a single exception that occurred in the application.
+ # @param [Exception] exception the exception that the client wants to send.
+ # @param [Hash] options the options to create the
+ # {Channel::Contracts::ExceptionData} object.
+ # @option options [String] :handled_at the type of exception
+ # (defaults to: 'UserCode')
+ # @option options [Hash] :properties the set of custom properties the client
+ # wants attached to this data item. (defaults to: {})
+ # @option options [Hash] :measurements the set of custom measurements the
+ # client wants to attach to this data item (defaults to: {})
+ def track_exception(exception, options={})
+ return unless exception.is_a? Exception
+
+ parsed_stack = []
+ if exception.backtrace
+ frame_pattern = /^(?.*):(?\d+)(\.|:in `((?.*)'$))/
+
+ exception.backtrace.each_with_index do |frame, counter|
+ match = frame_pattern.match frame
+ stack_frame = Channel::Contracts::StackFrame.new(
+ :assembly => 'Unknown',
+ :file_name => match['file'],
+ :level => counter,
+ :line => match['line'],
+ :method => match['method']
+ )
+
+ parsed_stack << stack_frame
+ end
+ end
+
+ details = Channel::Contracts::ExceptionDetails.new(
+ :id => 1,
+ :outer_id => 0,
+ :type_name => exception.class.name,
+ :message => exception.message,
+ :has_full_stack => exception.backtrace != nil,
+ :stack => (exception.backtrace.join("\n") if exception.backtrace),
+ :parsed_stack => parsed_stack
+ )
+
+ data = Channel::Contracts::ExceptionData.new(
+ :exceptions => [details],
+ :properties => options[:properties] || {},
+ :measurements => options[:measurements] || {},
+ # Must initialize handled_at after properties because it's actually stored in properties
+ :handled_at => options.fetch(:handled_at, 'UserCode')
+ )
+
+ self.channel.write(data, self.context)
+ end
+
+ # Send information about a single event that has occurred in the context of
+ # the application.
+ # @param [String] name the data to associate to this event.
+ # @param [Hash] options the options to create the
+ # {Channel::Contracts::EventData} object.
+ # @option options [Hash] :properties the set of custom properties the client
+ # wants attached to this data item. (defaults to: {})
+ # @option options [Hash] :measurements the set of custom measurements the
+ # client wants to attach to this data item (defaults to: {})
+ def track_event(name, options={})
+ data = Channel::Contracts::EventData.new(
+ :name => name || 'Null',
+ :properties => options[:properties] || {},
+ :measurements => options[:measurements] || {}
+ )
+
+ self.channel.write(data, self.context)
+ end
+
+ # Send information about a single metric data point that was captured for
+ # the application.
+ # @param [String] name the name of the metric that was captured.
+ # @param [Fixnum] value the value of the metric that was captured.
+ # @param [Hash] options the options to create the
+ # {Channel::Contracts::MetricData} object.
+ # @option options [Channel::Contracts::DataPointType] :type the type of the
+ # metric (defaults to: {Channel::Contracts::DataPointType::AGGREGATION})
+ # @option options [Fixnum] :count the number of metrics that were aggregated
+ # into this data point (defaults to: 0)
+ # @option options [Fixnum] :min the minimum of all metrics collected that
+ # were aggregated into this data point (defaults to: 0)
+ # @option options [Fixnum] :max the maximum of all metrics collected that
+ # were aggregated into this data point (defaults to: 0)
+ # @option options [Fixnum] :std_dev the standard deviation of all metrics
+ # collected that were aggregated into this data point (defaults to: 0)
+ # @option options [Hash] :properties the set of custom properties the client
+ # wants attached to this data item. (defaults to: {})
+ # @option options [Hash] :measurements the set of custom measurements the
+ # client wants to attach to this data item (defaults to: {})
+ def track_metric(name, value, options={})
+ data_point = Channel::Contracts::DataPoint.new(
+ :name => name || 'Null',
+ :value => value || 0,
+ :kind => options[:type] || Channel::Contracts::DataPointType::AGGREGATION,
+ :count => options[:count],
+ :min => options[:min],
+ :max => options[:max],
+ :std_dev => options[:std_dev]
+ )
+
+ data = Channel::Contracts::MetricData.new(
+ :metrics => [data_point],
+ :properties => options[:properties] || {}
+ )
+
+ self.channel.write(data, self.context)
+ end
+
+ # Sends a single trace statement.
+ # @param [String] name the trace statement.
+ # @param [Channel::Contracts::SeverityLevel] severity_level the severity level.
+ # @param [Hash] options the options to create the
+ # {Channel::Contracts::EventData} object.
+ # @option options [Hash] :properties the set of custom properties the client
+ # wants attached to this data item. (defaults to: {})
+ def track_trace(name, severity_level = nil, options={})
+ data = Channel::Contracts::MessageData.new(
+ :message => name || 'Null',
+ :severity_level => severity_level || Channel::Contracts::SeverityLevel::INFORMATION,
+ :properties => options[:properties] || {}
+ )
+
+ self.channel.write(data, self.context)
+ end
+
+ # Sends a single request.
+ # @param [String] id the unique identifier of the request.
+ # @param (String) start_time the start time of the request.
+ # @param [String] duration the duration to process the request.
+ # @param [String] response_code the response code of the request.
+ # @param [Boolean] success indicates whether the request succeeds or not.
+ # @param [Hash] options the options to create the
+ # {Channel::Contracts::RequestData} object.
+ # @option options [String] :name the name of the request.
+ # @option options [String] :http_method the http method used for the request.
+ # @option options [String] :url the url of the request.
+ # @option options [Hash] :properties the set of custom properties the client
+ # wants attached to this data item. (defaults to: {})
+ # @option options [Hash] :measurements the set of custom measurements the
+ # client wants to attach to this data item (defaults to: {})
+ def track_request(id, start_time, duration, response_code, success, options={})
+ data = Channel::Contracts::RequestData.new(
+ :id => id || 'Null',
+ :duration => duration || '0:00:00:00.0000000',
+ :response_code => response_code || 200,
+ :success => success = nil ? true : success,
+ :name => options[:name],
+ :url => options[:url],
+ :properties => options[:properties] || {},
+ :measurements => options[:measurements] || {},
+ # Must initialize http_method after properties because it's actually stored in properties
+ :http_method => options[:http_method]
+ )
+
+ self.channel.write(data, self.context, start_time)
+ end
+
+ # Flushes data in the queue. Data in the queue will be sent either immediately
+ # irrespective of what sender is being used.
+ def flush
+ self.channel.flush
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/unhandled_exception.rb b/source/plugins/ruby-fluentd4/lib/application_insights/unhandled_exception.rb
new file mode 100644
index 000000000..aa87b6f85
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/unhandled_exception.rb
@@ -0,0 +1,49 @@
+require_relative 'telemetry_client'
+require_relative 'channel/telemetry_channel'
+require_relative 'channel/synchronous_queue'
+require_relative 'channel/synchronous_sender'
+
+include ApplicationInsights
+
+module ApplicationInsights
+ module UnhandledException
+ # Module-level sender; also serves as an "already handled" sentinel so
+ # the exception is sent at most once even when {collect} is invoked
+ # multiple times (each call registers another at_exit hook).
+ @sender = nil
+
+ # Auto collects unhandled exception and send to the Application Insights service.
+ # @param [String] instrumentation_key used to identify which Application
+ # Insights application this data is for.
+ # @example
+ # require 'application_insights'
+ # ApplicationInsights::UnhandledException.collect('')
+ # raise Exception, 'Boom!'
+ def self.collect(instrumentation_key)
+ at_exit do
+ # Avoid sending exception more than once if this method got invoked multiple times
+ send(instrumentation_key) unless @sender
+ end
+ end
+
+ # @api private
+ # Send the last raised exception ($!) to the Application Insights service if
+ # telemetry_sender is not customized.
+ # NOTE(review): defining self.send shadows Object#send on this module —
+ # presumably intentional; confirm no caller relies on dynamic dispatch.
+ # @param [String] instrumentation_key used to identify which Application
+ # Insights application this data is for.
+ # @param [SenderBase] telemetry_sender used to send the last raised exception.
+ def self.send(instrumentation_key, telemetry_sender = nil)
+ # Ignore normal interpreter shutdown paths (exit calls and signals).
+ if $! && !$!.is_a?(SystemExit) && !$!.is_a?(SignalException)
+ if telemetry_sender
+ @sender = telemetry_sender
+ elsif !@sender
+ # Use a synchronized sender to guarantee the data would be sent out once flush
+ @sender = Channel::SynchronousSender.new
+ end
+
+ queue = Channel::SynchronousQueue.new @sender
+ channel = Channel::TelemetryChannel.new nil, queue
+ client = TelemetryClient.new instrumentation_key, channel
+ client.track_exception($!, handled_at: 'Unhandled')
+ client.flush
+ end
+ end
+ end
+end
diff --git a/source/plugins/ruby-fluentd4/lib/application_insights/version.rb b/source/plugins/ruby-fluentd4/lib/application_insights/version.rb
new file mode 100644
index 000000000..d2d56e833
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/lib/application_insights/version.rb
@@ -0,0 +1,3 @@
+module ApplicationInsights
+ # Version of the vendored application_insights SDK code in this plugin.
+ VERSION = '0.5.7'.freeze
+end
diff --git a/source/plugins/ruby-fluentd4/oms_common.rb b/source/plugins/ruby-fluentd4/oms_common.rb
new file mode 100644
index 000000000..dad713765
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/oms_common.rb
@@ -0,0 +1,1020 @@
+module OMS
+
+ MSDockerCImprovHostnameFilePath = '/var/opt/microsoft/docker-cimprov/state/containerhostname'
+ IPV6_REGEX = '\h{4}:\h{4}:\h{4}:\h{4}:\h{4}:\h{4}:\h{4}:\h{4}'
+ IPV4_Approximate_REGEX = '\d+\.\d+\.\d+\.\d+'
+
  class RetryRequestException < Exception
    # Throw this exception to tell the fluentd engine to retry and
    # inform the output plugin that it is indeed retryable.
    # NOTE(review): inherits from Exception rather than StandardError —
    # presumably so a bare `rescue` (which only catches StandardError) cannot
    # swallow it before fluentd sees it; confirm before changing the base class.
  end
+
+ class Common
+ require 'json'
+ require 'yajl'
+ require 'net/http'
+ require 'net/https'
+ require 'time'
+ require 'zlib'
+ require 'digest'
+ require 'date'
+ require 'securerandom'
+
+ require_relative 'omslog'
+ require_relative 'oms_configuration'
+
+ @@OSFullName = nil
+ @@OSName = nil
+ @@OSVersion = nil
+ @@Hostname = nil
+ @@HostnameFilePath = MSDockerCImprovHostnameFilePath
+ @@FQDN = nil
+ @@InstalledDate = nil
+ @@AgentVersion = nil
+ @@CurrentTimeZone = nil
+
+ @@tzMapping = {
+ 'Australia/Darwin' => 'AUS Central Standard Time',
+ 'Australia/Sydney' => 'AUS Eastern Standard Time',
+ 'Australia/Melbourne' => 'AUS Eastern Standard Time',
+ 'Asia/Kabul' => 'Afghanistan Standard Time',
+ 'America/Anchorage' => 'Alaskan Standard Time',
+ 'America/Juneau' => 'Alaskan Standard Time',
+ 'America/Metlakatla' => 'Alaskan Standard Time',
+ 'America/Nome' => 'Alaskan Standard Time',
+ 'America/Sitka' => 'Alaskan Standard Time',
+ 'America/Yakutat' => 'Alaskan Standard Time',
+ 'Asia/Riyadh' => 'Arab Standard Time',
+ 'Asia/Bahrain' => 'Arab Standard Time',
+ 'Asia/Kuwait' => 'Arab Standard Time',
+ 'Asia/Qatar' => 'Arab Standard Time',
+ 'Asia/Aden' => 'Arab Standard Time',
+ 'Asia/Dubai' => 'Arabian Standard Time',
+ 'Asia/Muscat' => 'Arabian Standard Time',
+ 'Etc/GMT-4' => 'Arabian Standard Time',
+ 'Asia/Baghdad' => 'Arabic Standard Time',
+ 'America/Buenos_Aires' => 'Argentina Standard Time',
+ 'America/Argentina/La_Rioja' => 'Argentina Standard Time',
+ 'America/Argentina/Rio_Gallegos' => 'Argentina Standard Time',
+ 'America/Argentina/Salta' => 'Argentina Standard Time',
+ 'America/Argentina/San_Juan' => 'Argentina Standard Time',
+ 'America/Argentina/San_Luis' => 'Argentina Standard Time',
+ 'America/Argentina/Tucuman' => 'Argentina Standard Time',
+ 'America/Argentina/Ushuaia' => 'Argentina Standard Time',
+ 'America/Catamarca' => 'Argentina Standard Time',
+ 'America/Cordoba' => 'Argentina Standard Time',
+ 'America/Jujuy' => 'Argentina Standard Time',
+ 'America/Mendoza' => 'Argentina Standard Time',
+ 'America/Halifax' => 'Atlantic Standard Time',
+ 'Atlantic/Bermuda' => 'Atlantic Standard Time',
+ 'America/Glace_Bay' => 'Atlantic Standard Time',
+ 'America/Goose_Bay' => 'Atlantic Standard Time',
+ 'America/Moncton' => 'Atlantic Standard Time',
+ 'America/Thule' => 'Atlantic Standard Time',
+ 'Asia/Baku' => 'Azerbaijan Standard Time',
+ 'Atlantic/Azores' => 'Azores Standard Time',
+ 'America/Scoresbysund' => 'Azores Standard Time',
+ 'America/Bahia' => 'Bahia Standard Time',
+ 'Asia/Dhaka' => 'Bangladesh Standard Time',
+ 'Asia/Thimphu' => 'Bangladesh Standard Time',
+ 'Europe/Minsk' => 'Belarus Standard Time',
+ 'America/Regina' => 'Canada Central Standard Time',
+ 'America/Swift_Current' => 'Canada Central Standard Time',
+ 'Atlantic/Cape_Verde' => 'Cape Verde Standard Time',
+ 'Etc/GMT+1' => 'Cape Verde Standard Time',
+ 'Asia/Yerevan' => 'Caucasus Standard Time',
+ 'Australia/Adelaide' => 'Cen. Australia Standard Time',
+ 'Australia/Broken_Hill' => 'Cen. Australia Standard Time',
+ 'America/Guatemala' => 'Central America Standard Time',
+ 'America/Belize' => 'Central America Standard Time',
+ 'America/Costa_Rica' => 'Central America Standard Time',
+ 'Pacific/Galapagos' => 'Central America Standard Time',
+ 'America/Tegucigalpa' => 'Central America Standard Time',
+ 'America/Managua' => 'Central America Standard Time',
+ 'America/El_Salvador' => 'Central America Standard Time',
+ 'Etc/GMT+6' => 'Central America Standard Time',
+ 'Asia/Almaty' => 'Central Asia Standard Time',
+ 'Antarctica/Vostok' => 'Central Asia Standard Time',
+ 'Indian/Chagos' => 'Central Asia Standard Time',
+ 'Asia/Bishkek' => 'Central Asia Standard Time',
+ 'Asia/Qyzylorda' => 'Central Asia Standard Time',
+ 'Etc/GMT-6' => 'Central Asia Standard Time',
+ 'America/Cuiaba' => 'Central Brazilian Standard Time',
+ 'America/Campo_Grande' => 'Central Brazilian Standard Time',
+ 'Europe/Budapest' => 'Central Europe Standard Time',
+ 'Europe/Tirane' => 'Central Europe Standard Time',
+ 'Europe/Prague' => 'Central Europe Standard Time',
+ 'Europe/Podgorica' => 'Central Europe Standard Time',
+ 'Europe/Belgrade' => 'Central Europe Standard Time',
+ 'Europe/Ljubljana' => 'Central Europe Standard Time',
+ 'Europe/Bratislava' => 'Central Europe Standard Time',
+ 'Europe/Warsaw' => 'Central European Standard Time',
+ 'Europe/Sarajevo' => 'Central European Standard Time',
+ 'Europe/Zagreb' => 'Central European Standard Time',
+ 'Europe/Skopje' => 'Central European Standard Time',
+ 'Pacific/Guadalcanal' => 'Central Pacific Standard Time',
+ 'Antarctica/Macquarie' => 'Central Pacific Standard Time',
+ 'Pacific/Ponape' => 'Central Pacific Standard Time',
+ 'Pacific/Kosrae' => 'Central Pacific Standard Time',
+ 'Pacific/Noumea' => 'Central Pacific Standard Time',
+ 'Pacific/Norfolk' => 'Central Pacific Standard Time',
+ 'Pacific/Bougainville' => 'Central Pacific Standard Time',
+ 'Pacific/Efate' => 'Central Pacific Standard Time',
+ 'Etc/GMT-11' => 'Central Pacific Standard Time',
+ 'America/Chicago' => 'Central Standard Time',
+ 'America/Winnipeg' => 'Central Standard Time',
+ 'America/Rainy_River' => 'Central Standard Time',
+ 'America/Rankin_Inlet' => 'Central Standard Time',
+ 'America/Resolute' => 'Central Standard Time',
+ 'America/Matamoros' => 'Central Standard Time',
+ 'America/Indiana/Knox' => 'Central Standard Time',
+ 'America/Indiana/Tell_City' => 'Central Standard Time',
+ 'America/Menominee' => 'Central Standard Time',
+ 'America/North_Dakota/Beulah' => 'Central Standard Time',
+ 'America/North_Dakota/Center' => 'Central Standard Time',
+ 'America/North_Dakota/New_Salem' => 'Central Standard Time',
+ 'CST6CDT' => 'Central Standard Time',
+ 'America/Mexico_City' => 'Central Standard Time (Mexico)',
+ 'America/Bahia_Banderas' => 'Central Standard Time (Mexico)',
+ 'America/Merida' => 'Central Standard Time (Mexico)',
+ 'America/Monterrey' => 'Central Standard Time (Mexico)',
+ 'Asia/Shanghai' => 'China Standard Time',
+ 'Asia/Chongqing' => 'China Standard Time',
+ 'Asia/Harbin' => 'China Standard Time',
+ 'Asia/Kashgar' => 'China Standard Time',
+ 'Asia/Urumqi' => 'China Standard Time',
+ 'Asia/Hong_Kong' => 'China Standard Time',
+ 'Asia/Macau' => 'China Standard Time',
+ 'Etc/GMT+12' => 'Dateline Standard Time',
+ 'Africa/Nairobi' => 'E. Africa Standard Time',
+ 'Antarctica/Syowa' => 'E. Africa Standard Time',
+ 'Africa/Djibouti' => 'E. Africa Standard Time',
+ 'Africa/Asmera' => 'E. Africa Standard Time',
+ 'Africa/Addis_Ababa' => 'E. Africa Standard Time',
+ 'Indian/Comoro' => 'E. Africa Standard Time',
+ 'Indian/Antananarivo' => 'E. Africa Standard Time',
+ 'Africa/Khartoum' => 'E. Africa Standard Time',
+ 'Africa/Mogadishu' => 'E. Africa Standard Time',
+ 'Africa/Juba' => 'E. Africa Standard Time',
+ 'Africa/Dar_es_Salaam' => 'E. Africa Standard Time',
+ 'Africa/Kampala' => 'E. Africa Standard Time',
+ 'Indian/Mayotte' => 'E. Africa Standard Time',
+ 'Etc/GMT-3' => 'E. Africa Standard Time',
+ 'Australia/Brisbane' => 'E. Australia Standard Time',
+ 'Australia/Lindeman' => 'E. Australia Standard Time',
+ 'Europe/Chisinau' => 'E. Europe Standard Time',
+ 'America/Sao_Paulo' => 'E. South America Standard Time',
+ 'America/New_York' => 'Eastern Standard Time',
+ 'America/Nassau' => 'Eastern Standard Time',
+ 'America/Toronto' => 'Eastern Standard Time',
+ 'America/Iqaluit' => 'Eastern Standard Time',
+ 'America/Montreal' => 'Eastern Standard Time',
+ 'America/Nipigon' => 'Eastern Standard Time',
+ 'America/Pangnirtung' => 'Eastern Standard Time',
+ 'America/Thunder_Bay' => 'Eastern Standard Time',
+ 'America/Havana' => 'Eastern Standard Time',
+ 'America/Port-au-Prince' => 'Eastern Standard Time',
+ 'America/Detroit' => 'Eastern Standard Time',
+ 'America/Indiana/Petersburg' => 'Eastern Standard Time',
+ 'America/Indiana/Vincennes' => 'Eastern Standard Time',
+ 'America/Indiana/Winamac' => 'Eastern Standard Time',
+ 'America/Kentucky/Monticello' => 'Eastern Standard Time',
+ 'America/Louisville' => 'Eastern Standard Time',
+ 'EST5EDT' => 'Eastern Standard Time',
+ 'America/Cancun' => 'Eastern Standard Time (Mexico)',
+ 'Africa/Cairo' => 'Egypt Standard Time',
+ 'Asia/Gaza' => 'Egypt Standard Time',
+ 'Asia/Hebron' => 'Egypt Standard Time',
+ 'Asia/Yekaterinburg' => 'Ekaterinburg Standard Time',
+ 'Europe/Kiev' => 'FLE Standard Time',
+ 'Europe/Mariehamn' => 'FLE Standard Time',
+ 'Europe/Sofia' => 'FLE Standard Time',
+ 'Europe/Tallinn' => 'FLE Standard Time',
+ 'Europe/Helsinki' => 'FLE Standard Time',
+ 'Europe/Vilnius' => 'FLE Standard Time',
+ 'Europe/Riga' => 'FLE Standard Time',
+ 'Europe/Uzhgorod' => 'FLE Standard Time',
+ 'Europe/Zaporozhye' => 'FLE Standard Time',
+ 'Pacific/Fiji' => 'Fiji Standard Time',
+ 'Europe/London' => 'GMT Standard Time',
+ 'Atlantic/Canary' => 'GMT Standard Time',
+ 'Atlantic/Faeroe' => 'GMT Standard Time',
+ 'Europe/Guernsey' => 'GMT Standard Time',
+ 'Europe/Dublin' => 'GMT Standard Time',
+ 'Europe/Isle_of_Man' => 'GMT Standard Time',
+ 'Europe/Jersey' => 'GMT Standard Time',
+ 'Europe/Lisbon' => 'GMT Standard Time',
+ 'Atlantic/Madeira' => 'GMT Standard Time',
+ 'Europe/Bucharest' => 'GTB Standard Time',
+ 'Asia/Nicosia' => 'GTB Standard Time',
+ 'Europe/Athens' => 'GTB Standard Time',
+ 'Asia/Tbilisi' => 'Georgian Standard Time',
+ 'America/Godthab' => 'Greenland Standard Time',
+ 'Atlantic/Reykjavik' => 'Greenwich Standard Time',
+ 'Africa/Ouagadougou' => 'Greenwich Standard Time',
+ 'Africa/Abidjan' => 'Greenwich Standard Time',
+ 'Africa/Accra' => 'Greenwich Standard Time',
+ 'Africa/Banjul' => 'Greenwich Standard Time',
+ 'Africa/Conakry' => 'Greenwich Standard Time',
+ 'Africa/Bissau' => 'Greenwich Standard Time',
+ 'Africa/Monrovia' => 'Greenwich Standard Time',
+ 'Africa/Bamako' => 'Greenwich Standard Time',
+ 'Africa/Nouakchott' => 'Greenwich Standard Time',
+ 'Atlantic/St_Helena' => 'Greenwich Standard Time',
+ 'Africa/Freetown' => 'Greenwich Standard Time',
+ 'Africa/Dakar' => 'Greenwich Standard Time',
+ 'Africa/Sao_Tome' => 'Greenwich Standard Time',
+ 'Africa/Lome' => 'Greenwich Standard Time',
+ 'Pacific/Honolulu' => 'Hawaiian Standard Time',
+ 'Pacific/Rarotonga' => 'Hawaiian Standard Time',
+ 'Pacific/Tahiti' => 'Hawaiian Standard Time',
+ 'Pacific/Johnston' => 'Hawaiian Standard Time',
+ 'Etc/GMT+10' => 'Hawaiian Standard Time',
+ 'Asia/Calcutta' => 'India Standard Time',
+ 'Asia/Tehran' => 'Iran Standard Time',
+ 'Asia/Jerusalem' => 'Israel Standard Time',
+ 'Asia/Amman' => 'Jordan Standard Time',
+ 'Europe/Kaliningrad' => 'Kaliningrad Standard Time',
+ 'Asia/Seoul' => 'Korea Standard Time',
+ 'Africa/Tripoli' => 'Libya Standard Time',
+ 'Pacific/Kiritimati' => 'Line Islands Standard Time',
+ 'Etc/GMT-14' => 'Line Islands Standard Time',
+ 'Asia/Magadan' => 'Magadan Standard Time',
+ 'Indian/Mauritius' => 'Mauritius Standard Time',
+ 'Indian/Reunion' => 'Mauritius Standard Time',
+ 'Indian/Mahe' => 'Mauritius Standard Time',
+ 'Asia/Beirut' => 'Middle East Standard Time',
+ 'America/Montevideo' => 'Montevideo Standard Time',
+ 'Africa/Casablanca' => 'Morocco Standard Time',
+ 'Africa/El_Aaiun' => 'Morocco Standard Time',
+ 'America/Denver' => 'Mountain Standard Time',
+ 'America/Edmonton' => 'Mountain Standard Time',
+ 'America/Cambridge_Bay' => 'Mountain Standard Time',
+ 'America/Inuvik' => 'Mountain Standard Time',
+ 'America/Yellowknife' => 'Mountain Standard Time',
+ 'America/Ojinaga' => 'Mountain Standard Time',
+ 'America/Boise' => 'Mountain Standard Time',
+ 'MST7MDT' => 'Mountain Standard Time',
+ 'America/Chihuahua' => 'Mountain Standard Time (Mexico)',
+ 'America/Mazatlan' => 'Mountain Standard Time (Mexico)',
+ 'Asia/Rangoon' => 'Myanmar Standard Time',
+ 'Indian/Cocos' => 'Myanmar Standard Time',
+ 'Asia/Novosibirsk' => 'N. Central Asia Standard Time',
+ 'Asia/Omsk' => 'N. Central Asia Standard Time',
+ 'Africa/Windhoek' => 'Namibia Standard Time',
+ 'Asia/Katmandu' => 'Nepal Standard Time',
+ 'Pacific/Auckland' => 'New Zealand Standard Time',
+ 'Antarctica/McMurdo' => 'New Zealand Standard Time',
+ 'America/St_Johns' => 'Newfoundland Standard Time',
+ 'Asia/Irkutsk' => 'North Asia East Standard Time',
+ 'Asia/Krasnoyarsk' => 'North Asia Standard Time',
+ 'Asia/Novokuznetsk' => 'North Asia Standard Time',
+ 'Asia/Pyongyang' => 'North Korea Standard Time',
+ 'America/Santiago' => 'Pacific SA Standard Time',
+ 'Antarctica/Palmer' => 'Pacific SA Standard Time',
+ 'America/Los_Angeles' => 'Pacific Standard Time',
+ 'America/Vancouver' => 'Pacific Standard Time',
+ 'America/Dawson' => 'Pacific Standard Time',
+ 'America/Whitehorse' => 'Pacific Standard Time',
+ 'America/Tijuana' => 'Pacific Standard Time',
+ 'America/Santa_Isabel' => 'Pacific Standard Time',
+ 'PST8PDT' => 'Pacific Standard Time',
+ 'Asia/Karachi' => 'Pakistan Standard Time',
+ 'America/Asuncion' => 'Paraguay Standard Time',
+ 'Europe/Paris' => 'Romance Standard Time',
+ 'Europe/Brussels' => 'Romance Standard Time',
+ 'Europe/Copenhagen' => 'Romance Standard Time',
+ 'Europe/Madrid' => 'Romance Standard Time',
+ 'Africa/Ceuta' => 'Romance Standard Time',
+ 'Asia/Srednekolymsk' => 'Russia Time Zone 10',
+ 'Asia/Kamchatka' => 'Russia Time Zone 11',
+ 'Asia/Anadyr' => 'Russia Time Zone 11',
+ 'Europe/Samara' => 'Russia Time Zone 3',
+ 'Europe/Moscow' => 'Russian Standard Time',
+ 'Europe/Simferopol' => 'Russian Standard Time',
+ 'Europe/Volgograd' => 'Russian Standard Time',
+ 'America/Cayenne' => 'SA Eastern Standard Time',
+ 'Antarctica/Rothera' => 'SA Eastern Standard Time',
+ 'America/Fortaleza' => 'SA Eastern Standard Time',
+ 'America/Araguaina' => 'SA Eastern Standard Time',
+ 'America/Belem' => 'SA Eastern Standard Time',
+ 'America/Maceio' => 'SA Eastern Standard Time',
+ 'America/Recife' => 'SA Eastern Standard Time',
+ 'America/Santarem' => 'SA Eastern Standard Time',
+ 'Atlantic/Stanley' => 'SA Eastern Standard Time',
+ 'America/Paramaribo' => 'SA Eastern Standard Time',
+ 'Etc/GMT+3' => 'SA Eastern Standard Time',
+ 'America/Bogota' => 'SA Pacific Standard Time',
+ 'America/Rio_Branco' => 'SA Pacific Standard Time',
+ 'America/Eirunepe' => 'SA Pacific Standard Time',
+ 'America/Coral_Harbour' => 'SA Pacific Standard Time',
+ 'Pacific/Easter' => 'SA Pacific Standard Time',
+ 'America/Guayaquil' => 'SA Pacific Standard Time',
+ 'America/Jamaica' => 'SA Pacific Standard Time',
+ 'America/Cayman' => 'SA Pacific Standard Time',
+ 'America/Panama' => 'SA Pacific Standard Time',
+ 'America/Lima' => 'SA Pacific Standard Time',
+ 'Etc/GMT+5' => 'SA Pacific Standard Time',
+ 'America/La_Paz' => 'SA Western Standard Time',
+ 'America/Antigua' => 'SA Western Standard Time',
+ 'America/Anguilla' => 'SA Western Standard Time',
+ 'America/Aruba' => 'SA Western Standard Time',
+ 'America/Barbados' => 'SA Western Standard Time',
+ 'America/St_Barthelemy' => 'SA Western Standard Time',
+ 'America/Kralendijk' => 'SA Western Standard Time',
+ 'America/Manaus' => 'SA Western Standard Time',
+ 'America/Boa_Vista' => 'SA Western Standard Time',
+ 'America/Porto_Velho' => 'SA Western Standard Time',
+ 'America/Blanc-Sablon' => 'SA Western Standard Time',
+ 'America/Curacao' => 'SA Western Standard Time',
+ 'America/Dominica' => 'SA Western Standard Time',
+ 'America/Santo_Domingo' => 'SA Western Standard Time',
+ 'America/Grenada' => 'SA Western Standard Time',
+ 'America/Guadeloupe' => 'SA Western Standard Time',
+ 'America/Guyana' => 'SA Western Standard Time',
+ 'America/St_Kitts' => 'SA Western Standard Time',
+ 'America/St_Lucia' => 'SA Western Standard Time',
+ 'America/Marigot' => 'SA Western Standard Time',
+ 'America/Martinique' => 'SA Western Standard Time',
+ 'America/Montserrat' => 'SA Western Standard Time',
+ 'America/Puerto_Rico' => 'SA Western Standard Time',
+ 'America/Lower_Princes' => 'SA Western Standard Time',
+ 'America/Grand_Turk' => 'SA Western Standard Time',
+ 'America/Port_of_Spain' => 'SA Western Standard Time',
+ 'America/St_Vincent' => 'SA Western Standard Time',
+ 'America/Tortola' => 'SA Western Standard Time',
+ 'America/St_Thomas' => 'SA Western Standard Time',
+ 'Etc/GMT+4' => 'SA Western Standard Time',
+ 'Asia/Bangkok' => 'SE Asia Standard Time',
+ 'Antarctica/Davis' => 'SE Asia Standard Time',
+ 'Indian/Christmas' => 'SE Asia Standard Time',
+ 'Asia/Jakarta' => 'SE Asia Standard Time',
+ 'Asia/Pontianak' => 'SE Asia Standard Time',
+ 'Asia/Phnom_Penh' => 'SE Asia Standard Time',
+ 'Asia/Vientiane' => 'SE Asia Standard Time',
+ 'Asia/Hovd' => 'SE Asia Standard Time',
+ 'Asia/Saigon' => 'SE Asia Standard Time',
+ 'Etc/GMT-7' => 'SE Asia Standard Time',
+ 'Pacific/Apia' => 'Samoa Standard Time',
+ 'Asia/Singapore' => 'Singapore Standard Time',
+ 'Asia/Brunei' => 'Singapore Standard Time',
+ 'Asia/Makassar' => 'Singapore Standard Time',
+ 'Asia/Kuala_Lumpur' => 'Singapore Standard Time',
+ 'Asia/Kuching' => 'Singapore Standard Time',
+ 'Asia/Manila' => 'Singapore Standard Time',
+ 'Etc/GMT-8' => 'Singapore Standard Time',
+ 'Africa/Johannesburg' => 'South Africa Standard Time',
+ 'Africa/Bujumbura' => 'South Africa Standard Time',
+ 'Africa/Gaborone' => 'South Africa Standard Time',
+ 'Africa/Lubumbashi' => 'South Africa Standard Time',
+ 'Africa/Maseru' => 'South Africa Standard Time',
+ 'Africa/Blantyre' => 'South Africa Standard Time',
+ 'Africa/Maputo' => 'South Africa Standard Time',
+ 'Africa/Kigali' => 'South Africa Standard Time',
+ 'Africa/Mbabane' => 'South Africa Standard Time',
+ 'Africa/Lusaka' => 'South Africa Standard Time',
+ 'Africa/Harare' => 'South Africa Standard Time',
+ 'Etc/GMT-2' => 'South Africa Standard Time',
+ 'Asia/Colombo' => 'Sri Lanka Standard Time',
+ 'Asia/Damascus' => 'Syria Standard Time',
+ 'Asia/Taipei' => 'Taipei Standard Time',
+ 'Australia/Hobart' => 'Tasmania Standard Time',
+ 'Australia/Currie' => 'Tasmania Standard Time',
+ 'Asia/Tokyo' => 'Tokyo Standard Time',
+ 'Asia/Jayapura' => 'Tokyo Standard Time',
+ 'Pacific/Palau' => 'Tokyo Standard Time',
+ 'Asia/Dili' => 'Tokyo Standard Time',
+ 'Etc/GMT-9' => 'Tokyo Standard Time',
+ 'Pacific/Tongatapu' => 'Tonga Standard Time',
+ 'Pacific/Enderbury' => 'Tonga Standard Time',
+ 'Pacific/Fakaofo' => 'Tonga Standard Time',
+ 'Etc/GMT-13' => 'Tonga Standard Time',
+ 'Europe/Istanbul' => 'Turkey Standard Time',
+ 'America/Indianapolis' => 'US Eastern Standard Time',
+ 'America/Indiana/Marengo' => 'US Eastern Standard Time',
+ 'America/Indiana/Vevay' => 'US Eastern Standard Time',
+ 'America/Phoenix' => 'US Mountain Standard Time',
+ 'America/Dawson_Creek' => 'US Mountain Standard Time',
+ 'America/Creston' => 'US Mountain Standard Time',
+ 'America/Fort_Nelson' => 'US Mountain Standard Time',
+ 'America/Hermosillo' => 'US Mountain Standard Time',
+ 'Etc/GMT+7' => 'US Mountain Standard Time',
+ 'Etc/GMT' => 'UTC',
+ 'Etc/UTC' => 'UTC',
+ 'America/Danmarkshavn' => 'UTC',
+ 'Etc/GMT-12' => 'UTC+12',
+ 'Pacific/Tarawa' => 'UTC+12',
+ 'Pacific/Majuro' => 'UTC+12',
+ 'Pacific/Kwajalein' => 'UTC+12',
+ 'Pacific/Nauru' => 'UTC+12',
+ 'Pacific/Funafuti' => 'UTC+12',
+ 'Pacific/Wake' => 'UTC+12',
+ 'Pacific/Wallis' => 'UTC+12',
+ 'Etc/GMT+2' => 'UTC-02',
+ 'America/Noronha' => 'UTC-02',
+ 'Atlantic/South_Georgia' => 'UTC-02',
+ 'Etc/GMT+11' => 'UTC-11',
+ 'Pacific/Pago_Pago' => 'UTC-11',
+ 'Pacific/Niue' => 'UTC-11',
+ 'Pacific/Midway' => 'UTC-11',
+ 'Asia/Ulaanbaatar' => 'Ulaanbaatar Standard Time',
+ 'Asia/Choibalsan' => 'Ulaanbaatar Standard Time',
+ 'America/Caracas' => 'Venezuela Standard Time',
+ 'Asia/Vladivostok' => 'Vladivostok Standard Time',
+ 'Asia/Sakhalin' => 'Vladivostok Standard Time',
+ 'Asia/Ust-Nera' => 'Vladivostok Standard Time',
+ 'Australia/Perth' => 'W. Australia Standard Time',
+ 'Antarctica/Casey' => 'W. Australia Standard Time',
+ 'Africa/Lagos' => 'W. Central Africa Standard Time',
+ 'Africa/Luanda' => 'W. Central Africa Standard Time',
+ 'Africa/Porto-Novo' => 'W. Central Africa Standard Time',
+ 'Africa/Kinshasa' => 'W. Central Africa Standard Time',
+ 'Africa/Bangui' => 'W. Central Africa Standard Time',
+ 'Africa/Brazzaville' => 'W. Central Africa Standard Time',
+ 'Africa/Douala' => 'W. Central Africa Standard Time',
+ 'Africa/Algiers' => 'W. Central Africa Standard Time',
+ 'Africa/Libreville' => 'W. Central Africa Standard Time',
+ 'Africa/Malabo' => 'W. Central Africa Standard Time',
+ 'Africa/Niamey' => 'W. Central Africa Standard Time',
+ 'Africa/Ndjamena' => 'W. Central Africa Standard Time',
+ 'Africa/Tunis' => 'W. Central Africa Standard Time',
+ 'Etc/GMT-1' => 'W. Central Africa Standard Time',
+ 'Europe/Berlin' => 'W. Europe Standard Time',
+ 'Europe/Andorra' => 'W. Europe Standard Time',
+ 'Europe/Vienna' => 'W. Europe Standard Time',
+ 'Europe/Zurich' => 'W. Europe Standard Time',
+ 'Europe/Busingen' => 'W. Europe Standard Time',
+ 'Europe/Gibraltar' => 'W. Europe Standard Time',
+ 'Europe/Rome' => 'W. Europe Standard Time',
+ 'Europe/Vaduz' => 'W. Europe Standard Time',
+ 'Europe/Luxembourg' => 'W. Europe Standard Time',
+ 'Europe/Monaco' => 'W. Europe Standard Time',
+ 'Europe/Malta' => 'W. Europe Standard Time',
+ 'Europe/Amsterdam' => 'W. Europe Standard Time',
+ 'Europe/Oslo' => 'W. Europe Standard Time',
+ 'Europe/Stockholm' => 'W. Europe Standard Time',
+ 'Arctic/Longyearbyen' => 'W. Europe Standard Time',
+ 'Europe/San_Marino' => 'W. Europe Standard Time',
+ 'Europe/Vatican' => 'W. Europe Standard Time',
+ 'Asia/Tashkent' => 'West Asia Standard Time',
+ 'Antarctica/Mawson' => 'West Asia Standard Time',
+ 'Asia/Oral' => 'West Asia Standard Time',
+ 'Asia/Aqtau' => 'West Asia Standard Time',
+ 'Asia/Aqtobe' => 'West Asia Standard Time',
+ 'Indian/Maldives' => 'West Asia Standard Time',
+ 'Indian/Kerguelen' => 'West Asia Standard Time',
+ 'Asia/Dushanbe' => 'West Asia Standard Time',
+ 'Asia/Ashgabat' => 'West Asia Standard Time',
+ 'Asia/Samarkand' => 'West Asia Standard Time',
+ 'Etc/GMT-5' => 'West Asia Standard Time',
+ 'Pacific/Port_Moresby' => 'West Pacific Standard Time',
+ 'Antarctica/DumontDUrville' => 'West Pacific Standard Time',
+ 'Pacific/Truk' => 'West Pacific Standard Time',
+ 'Pacific/Guam' => 'West Pacific Standard Time',
+ 'Pacific/Saipan' => 'West Pacific Standard Time',
+ 'Etc/GMT-10' => 'West Pacific Standard Time',
+ 'Asia/Yakutsk' => 'Yakutsk Standard Time',
+ 'Asia/Chita' => 'Yakutsk Standard Time',
+ 'Asia/Khandyga' => 'Yakutsk Standard Time'
+ }
+
+ @@tzLocalTimePath = '/etc/localtime'
+ @@tzBaseFolder = '/usr/share/zoneinfo/'
+ @@tzRightFolder = 'right/'
+
+ class << self
+
+ # Internal methods
+ # (left public for easy testing, though protected may be better later)
+
+ def clean_hostname_string(hnBuffer)
+ return "" if hnBuffer.nil? # So give the rest of the program a string to deal with.
+ hostname_buffer = hnBuffer.strip
+ return hostname_buffer
+ end
+
+ def has_designated_hostnamefile?
+ return false if @@HostnameFilePath.nil?
+ return false unless @@HostnameFilePath =~ /\w/
+ return false unless File.exist?(@@HostnameFilePath)
+ return true
+ end
+
+ def is_dot_separated_string?(hnBuffer)
+ return true if /[^.]+\.[^.]+/ =~ hnBuffer
+ return false
+ end
+
+ def is_hostname_compliant?(hnBuffer)
+ # RFC 2181:
+ # Size limit is 1 to 63 octets, so probably bytesize is appropriate method.
+ return false if hnBuffer.nil?
+ return false if /\./ =~ hnBuffer # Hostname by definition may not contain a dot.
+ return false if /:/ =~ hnBuffer # Hostname by definition may not contain a colon.
+ return false unless 1 <= hnBuffer.bytesize && hnBuffer.bytesize <= 63
+ return true
+ end
+
+ def is_like_ipv4_string?(hnBuffer)
+ return false unless /\A#{IPV4_Approximate_REGEX}\z/ =~ hnBuffer
+ qwa = hnBuffer.split('.')
+ return false unless qwa.length == 4
+ return false if qwa[0].to_i == 0
+ qwa.each do |quadwordstring|
+ bi = quadwordstring.to_i
+ # This may need more detail if 255 octets are sometimes allowed, but I don't think so.
+ return false unless 0 <= bi and bi < 255
+ end
+ return true
+ end
+
      # True when the buffer is exactly a fully-expanded IPv6 address: eight
      # colon-separated groups of four hex digits (IPV6_REGEX uses \h{4}).
      # NOTE(review): compressed forms ('::') and groups shorter than four
      # digits are NOT matched — presumably acceptable for this hostname
      # sanity check; confirm if abbreviated addresses can reach this code.
      def is_like_ipv6_string?(hnBuffer)
        return true if /\A#{IPV6_REGEX}\z/ =~ hnBuffer
        return false
      end
+
      # Resolve the machine name via Socket.gethostname and cache the cleaned
      # result in @@Hostname. On failure the error is logged once and
      # @@Hostname is left untouched.
      def look_for_socket_class_host_address
        hostname_buffer = nil

        begin
          hostname_buffer = Socket.gethostname
        rescue => error
          OMS::Log.error_once("Unable to get the Host Name using socket facility: #{error}")
          return
        end
        @@Hostname = clean_hostname_string(hostname_buffer)

        return # Thwart accidental return to force correct use.
      end
+
      # Issue:
      # When omsagent runs inside a container, gethostname returns the hostname of the container (random name)
      # not the actual machine hostname.
      # One way to solve this problem is to set the container hostname same as machine name, but this is not
      # possible when host-machine is a private VM inside a cluster.
      # Solution:
      # Share/mount '/etc/hostname' as '/var/opt/microsoft/omsagent/state/containername' with container and
      # omsagent will read hostname from shared file.
      # Caches the cleaned file contents in @@Hostname. If the read raises,
      # hostname_buffer stays nil and @@Hostname is set to "" (see
      # clean_hostname_string); an unreadable file leaves @@Hostname untouched.
      def look_in_designated_hostnamefile
        hostname_buffer = nil

        unless File.readable?(@@HostnameFilePath)
          OMS::Log.warn_once("File '#{@@HostnameFilePath}' exists but is not readable.")
          return
        end

        begin
          hostname_buffer = File.read(@@HostnameFilePath)
        rescue => error
          OMS::Log.warn_once("Unable to read the hostname from #{@@HostnameFilePath}: #{error}")
        end
        @@Hostname = clean_hostname_string(hostname_buffer)
        return # Thwart accidental return to force correct use.
      end
+
+ def validate_hostname_equivalent(hnBuffer)
+ # RFC 1123 and 2181
+ # Note that for now we are limiting the earlier maximum of 63 for fqdn labels and thus
+ # hostnames UNTIL we are assured azure will allow 255, as specified in RFC 1123, or
+ # we are otherwise instructed.
+ rfcl = "RFCs 1123, 2181 with hostname range of {1,63} octets for non-root item."
+ return if is_hostname_compliant?(hnBuffer)
+ return if is_like_ipv4_string?(hnBuffer)
+ return if is_like_ipv6_string?(hnBuffer)
+ msg = "Hostname '#{hnBuffer}' not compliant (#{rfcl}). Not IP Address Either."
+ OMS::Log.warn_once(msg)
+ raise NameError, msg
+ end
+
+ # End of Internal methods
+
+ # get the unified timezone id by absolute file path of the timezone file
+ # file path: the absolute path of the file
+ def get_unified_timezoneid(filepath)
+ # remove the baseFolder path
+ tzID = filepath[@@tzBaseFolder.length..-1] if filepath.start_with?(@@tzBaseFolder)
+
+ return 'Unknown' if tzID.nil?
+
+ # if the rest starts with 'right/', remove it to unify the format
+ tzID = tzID[@@tzRightFolder.length..-1] if tzID.start_with?(@@tzRightFolder)
+
+ return tzID
+ end # end get_unified_timezoneid
+
      # Map the machine's local timezone to the corresponding Windows
      # timezone name via @@tzMapping and cache it in @@CurrentTimeZone.
      # Strategy: follow the /etc/localtime symlink when present; otherwise
      # scan /usr/share/zoneinfo for a file with the same MD5 digest as
      # /etc/localtime. Falls back to the raw tz id ('Unknown' if nothing
      # matched) when no mapping entry exists.
      def get_current_timezone
        return @@CurrentTimeZone if !@@CurrentTimeZone.nil?

        tzID = 'Unknown'

        begin
          # if /etc/localtime is a symlink, check the link file's path
          if File.symlink?(@@tzLocalTimePath)
            symlinkpath = File.absolute_path(File.readlink(@@tzLocalTimePath), File.dirname(@@tzLocalTimePath))
            tzID = get_unified_timezoneid(symlinkpath)

            # look for the entry in the timezone mapping
            if @@tzMapping.has_key?(tzID)
              @@CurrentTimeZone = @@tzMapping[tzID]
              return @@CurrentTimeZone
            end
          end

          # calculate the md5 of /etc/localtime
          md5sum = Digest::MD5.file(@@tzLocalTimePath).hexdigest

          # looks for a file in the /usr/share/zoneinfo/, which is identical to /etc/localtime. use the file name as the timezone
          Dir.glob("#{@@tzBaseFolder}**/*") { |filepath|
            # find all the files whose md5 is the same as the /etc/localtime
            if File.file? filepath and Digest::MD5.file(filepath).hexdigest == md5sum
              tzID = get_unified_timezoneid(filepath)

              # look for the entry in the timezone mapping
              if @@tzMapping.has_key?(tzID)
                @@CurrentTimeZone = @@tzMapping[tzID]
                return @@CurrentTimeZone
              end
            end
          }
        rescue => error
          OMS::Log.error_once("Unable to get the current time zone: #{error}")
        end

        # assign the tzID if the corresponding Windows Time Zone is not found
        @@CurrentTimeZone = tzID if @@CurrentTimeZone.nil?

        return @@CurrentTimeZone
      end # end get_current_timezone
+
+ def get_os_full_name(conf_path = "/etc/opt/microsoft/scx/conf/scx-release")
+ return @@OSFullName if !@@OSFullName.nil?
+
+ if File.file?(conf_path)
+ conf = File.read(conf_path)
+ os_full_name = conf[/OSFullName=(.*?)\n/, 1]
+ if os_full_name and os_full_name.size
+ @@OSFullName = os_full_name
+ end
+ end
+ return @@OSFullName
+ end
+
+ def get_os_name(conf_path = "/etc/opt/microsoft/scx/conf/scx-release")
+ return @@OSName if !@@OSName.nil?
+
+ if File.file?(conf_path)
+ conf = File.read(conf_path)
+ os_name = conf[/OSName=(.*?)\n/, 1]
+ if os_name and os_name.size
+ @@OSName = os_name
+ end
+ end
+ return @@OSName
+ end
+
+ def get_os_version(conf_path = "/etc/opt/microsoft/scx/conf/scx-release")
+ return @@OSVersion if !@@OSVersion.nil?
+
+ if File.file?(conf_path)
+ conf = File.read(conf_path)
+ os_version = conf[/OSVersion=(.*?)\n/, 1]
+ if os_version and os_version.size
+ @@OSVersion = os_version
+ end
+ end
+ return @@OSVersion
+ end
+
      # Return the cached machine hostname, recomputing it when the cached
      # value is non-compliant or ignoreOldValue is true. Prefers the
      # container-mounted hostname file (when present) over
      # Socket.gethostname. A non-compliant result is logged once but still
      # returned.
      def get_hostname(ignoreOldValue = false)

        if not is_hostname_compliant?(@@Hostname) or ignoreOldValue then

          look_in_designated_hostnamefile if has_designated_hostnamefile?

          look_for_socket_class_host_address unless is_hostname_compliant?(@@Hostname)
        end

        begin
          validate_hostname_equivalent(@@Hostname)
        rescue => error
          OMS::Log.warn_once("Hostname '#{@@Hostname}' found, but did NOT validate as compliant. #{error}. Using anyway.")
        end
        return @@Hostname
      end
+
      # Resolve and cache the fully qualified domain name of this host.
      # Returns nil (and logs the error once) when resolution fails; a later
      # call will retry since only a success is cached.
      # NOTE(review): Socket.gethostbyname is deprecated in recent Ruby in
      # favor of Addrinfo — confirm the target Ruby version before changing.
      def get_fully_qualified_domain_name
        return @@FQDN unless @@FQDN.nil?

        begin
          fqdn = Socket.gethostbyname(Socket.gethostname)[0]
        rescue => error
          OMS::Log.error_once("Unable to get the FQDN: #{error}")
        else
          @@FQDN = fqdn
        end
        return @@FQDN
      end
+
+ def get_installed_date(conf_path = "/etc/opt/microsoft/omsagent/sysconf/installinfo.txt")
+ return @@InstalledDate if !@@InstalledDate.nil?
+
+ if File.file?(conf_path)
+ conf = File.read(conf_path)
+ installed_date = conf[/(.*)\n(.*)/, 2]
+ if installed_date and installed_date.size
+ begin
+ Time.parse(installed_date)
+ rescue ArgumentError
+ OMS::Log.error_once("Invalid install date: #{installed_date}")
+ else
+ @@InstalledDate = installed_date
+ end
+ end
+ end
+ return @@InstalledDate
+ end
+
+ def get_agent_version(conf_path = "/etc/opt/microsoft/omsagent/sysconf/installinfo.txt")
+ return @@AgentVersion if !@@AgentVersion.nil?
+
+ if File.file?(conf_path)
+ conf = File.read(conf_path)
+ agent_version = conf[/([\d]+\.[\d]+\.[\d]+-[\d]+)\s.*\n/, 1]
+ if agent_version and agent_version.size
+ @@AgentVersion = agent_version
+ end
+ end
+ return @@AgentVersion
+ end
+
+ def format_time(time)
+ Time.at(time).utc.iso8601(3) # UTC with milliseconds
+ end
+
+ def format_time_str(time)
+ DateTime.parse(time).strftime("%FT%H:%M:%S.%3NZ")
+ end
+
+ def create_error_tag(tag)
+ "ERROR::#{tag}::"
+ end
+
+ # create an HTTP object which uses HTTPS
+ def create_secure_http(uri, proxy={})
+ if proxy.empty?
+ http = Net::HTTP.new( uri.host, uri.port )
+ else
+ http = Net::HTTP.new( uri.host, uri.port,
+ proxy[:addr], proxy[:port], proxy[:user], proxy[:pass])
+ end
+ http.use_ssl = true
+ http.verify_mode = OpenSSL::SSL::VERIFY_PEER
+ http.open_timeout = 30
+ return http
+ end # create_secure_http
+
      # create an HTTP object to ODS, authenticated with the agent's client
      # certificate and private key taken from OMS::Configuration
      def create_ods_http(ods_uri, proxy={})
        http = create_secure_http(ods_uri, proxy)
        http.cert = Configuration.cert
        http.key = Configuration.key
        return http
      end # create_ods_http
+
      # create an HTTPRequest object to ODS
      # parameters:
      #   path: string. path of the request
      #   record: Hash. body of the request
      #   compress: bool. Whether the body of the request should be compressed
      #   extra_headers: Hash. extra HTTP headers
      #   serializer: method. serializer of the record
      # returns:
      #   HTTPRequest. request to ODS, or nil when the record cannot be
      #   serialized
      def create_ods_request(path, record, compress, extra_headers=nil, serializer=method(:parse_json_record_encoding))
        headers = extra_headers.nil? ? {} : extra_headers

        # Optional identity/routing headers; header-name casing must survive
        # on the wire, hence the CaseSensitiveString keys.
        azure_resource_id = OMS::Configuration.azure_resource_id
        if !azure_resource_id.to_s.empty?
          headers[OMS::CaseSensitiveString.new("x-ms-AzureResourceId")] = azure_resource_id
        end

        # azure_region stays nil when the Configuration method is not defined.
        azure_region = OMS::Configuration.azure_region if defined?(OMS::Configuration.azure_region)
        if !azure_region.to_s.empty?
          headers[OMS::CaseSensitiveString.new("x-ms-AzureRegion")] = azure_region
        end

        omscloud_id = OMS::Configuration.omscloud_id
        if !omscloud_id.to_s.empty?
          headers[OMS::CaseSensitiveString.new("x-ms-OMSCloudId")] = omscloud_id
        end

        uuid = OMS::Configuration.uuid
        if !uuid.to_s.empty?
          headers[OMS::CaseSensitiveString.new("x-ms-UUID")] = uuid
        end

        # Fresh request id per call, for request tracing on the service side.
        headers[OMS::CaseSensitiveString.new("X-Request-ID")] = SecureRandom.uuid

        headers["Content-Type"] = "application/json"
        if compress == true
          headers["Content-Encoding"] = "deflate"
        end

        req = Net::HTTP::Post.new(path, headers)
        json_msg = serializer.call(record)
        if json_msg.nil?
          return nil
        else
          if compress == true
            req.body = Zlib::Deflate.deflate(json_msg)
          else
            req.body = json_msg
          end
        end
        return req
      end # create_ods_request
+
# parses the json record with appropriate encoding
# parameters:
#   record: Hash. body of the request
# returns:
#   json represention of object,
#   nil if encoding cannot be applied
def parse_json_record_encoding(record)
  msg = nil
  begin
    msg = Yajl.dump(record)
  rescue => error
    # failed encoding, encode to utf-8, iso-8859-1 and try again.
    # Only the per-item "Message" payloads are re-encoded — the rest of the
    # record is expected to be encoding-safe already.
    begin
      OMS::Log.warn_once("Yajl.dump() failed due to encoding, will try iso-8859-1 for #{record}: #{error}")

      if !record["DataItems"].nil?
        record["DataItems"].each do |item|
          item["Message"] = item["Message"].encode('utf-8', 'iso-8859-1')
        end
      end
      msg = Yajl.dump(record)
    rescue => error
      # at this point we've given up, we don't recognize
      # the encode, so return nil and log_warning for the
      # record
      OMS::Log.warn_once("Skipping due to failed encoding for #{record}: #{error}")
    end
  end
  return msg
end

# dump the records into json string
# assume the records is an array of single layer hash
# return nil if we cannot dump it
# parameters:
#   records: hash[]. an array of single layer hash
def safe_dump_simple_hash_array(records)
  msg = nil

  begin
    msg = JSON.dump(records)
  rescue JSON::GeneratorError => error
    OMS::Log.warn_once("Unable to dump to JSON string. #{error}")
    begin
      # failed to dump, encode to utf-8, iso-8859-1 and try again
      # records is an array of hash
      records.each do | hash |
        # the value is a hash
        hash.each do | key, value |
          # the value should be of simple type
          # encode the string to utf-8 (mutates the caller's hashes in place)
          if value.instance_of? String
            hash[key] = value.encode('utf-8', 'iso-8859-1')
          end
        end
      end

      msg = JSON.dump(records)
    rescue => error
      # at this point we've given up, we don't recognize the encode,
      # so return nil and log_warning for the record
      OMS::Log.warn_once("Skipping due to failed encoding for #{records}: #{error}")
    end
  rescue => error
    # unexpected error when dumping the records into JSON string
    # skip here and return nil
    OMS::Log.warn_once("Skipping due to unexpected error for #{records}: #{error}")
  end

  return msg
end # safe_dump_simple_hash_array
+
# start a request
# parameters:
#   req: HTTPRequest. request
#   secure_http: HTTP. HTTPS
#   ignore404: bool. ignore the 404 error when it's true
#   return_entire_response: bool. If true, return the entire response object
# returns:
#   string. body of the response (or the whole response if return_entire_response is true)
# raises:
#   RetryRequestException on connection failure, nil response, or any
#   non-success status (other than an ignored 404). The fluentd engine is
#   expected to catch it and retry the buffered chunk.
def start_request(req, secure_http, ignore404 = false, return_entire_response = false)
  # Tries to send the passed in request
  # Raises an exception if the request fails.
  # This exception should only be caught by the fluentd engine so that it retries sending this
  begin
    res = nil
    res = secure_http.start { |http| http.request(req) }
  rescue => e # rescue all StandardErrors
    # Server didn't respond
    raise RetryRequestException, "Net::HTTP.#{req.method.capitalize} raises exception: #{e.class}, '#{e.message}'"
  else
    if res.nil?
      raise RetryRequestException, "Failed to #{req.method} at #{req.to_s} (res=nil)"
    end

    if res.is_a?(Net::HTTPSuccess)
      if return_entire_response
        return res
      else
        return res.body
      end
    end

    if ignore404 and res.code == "404"
      return ''
    end

    if res.code != "200"
      # Retry all failure error codes...
      # X-Request-ID ties this error back to the request built in create_ods_request.
      res_summary = "(request-id=#{req["X-Request-ID"]}; class=#{res.class.name}; code=#{res.code}; message=#{res.message}; body=#{res.body};)"
      OMS::Log.error_once("HTTP Error: #{res_summary}")
      raise RetryRequestException, "HTTP error: #{res_summary}"
    end

  end # end begin
end # end start_request
+ end # Class methods
+
+ end # class Common
+
# Hostname -> IP cache with periodic full invalidation. A background thread
# empties the cache every refresh_interval_seconds so stale DNS answers get
# re-resolved; lookups are mutex-guarded so the cache is safe to share
# between threads.
class IPcache

  def initialize(refresh_interval_seconds)
    @cache = {}
    @cache_lock = Mutex.new
    @refresh_interval_seconds = refresh_interval_seconds
    @condition = ConditionVariable.new
    @thread = Thread.new(&method(:refresh_cache))
  end

  # Return the cached IP for hostname, resolving and caching on a miss.
  # A failed resolution caches nil until the next flush.
  def get_ip(hostname)
    @cache_lock.synchronize {
      if @cache.has_key?(hostname)
        return @cache[hostname]
      else
        ip = get_ip_from_socket(hostname)
        @cache[hostname] = ip
        return ip
      end
    }
  end

  private

  # Resolve via getaddrinfo; returns the first resolved address, or nil on
  # failure (logged once per hostname).
  def get_ip_from_socket(hostname)
    begin
      addrinfos = Socket::getaddrinfo(hostname, "echo", Socket::AF_UNSPEC)
    rescue => error
      OMS::Log.error_once("Unable to resolve the IP of '#{hostname}': #{error}")
      return nil
    end

    if addrinfos.size >= 1
      return addrinfos[0][3]
    end

    return nil
  end

  # Background loop. Nothing in this class signals @condition, so the timed
  # wait acts as a lock-releasing sleep of @refresh_interval_seconds.
  def refresh_cache
    while true
      @cache_lock.synchronize {
        @condition.wait(@cache_lock, @refresh_interval_seconds)
        # Flush the cache completely to prevent it from growing indefinitely
        @cache = {}
      }
    end
  end

end
+
# String subclass whose case-changing methods are no-ops. Net::HTTP
# normalizes header field names via capitalize/downcase; wrapping a header
# key in CaseSensitiveString preserves its exact casing on the wire (used
# for the x-ms-* headers in create_ods_request).
class CaseSensitiveString < String
  def downcase
    self
  end
  def capitalize
    self
  end
  def to_s
    self
  end
end
+
+end # module OMS
diff --git a/source/plugins/ruby-fluentd4/oms_configuration.rb b/source/plugins/ruby-fluentd4/oms_configuration.rb
new file mode 100644
index 000000000..436a0ff65
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/oms_configuration.rb
@@ -0,0 +1,381 @@
+# frozen_string_literal: true
+
+module OMS
+
+ class Configuration
+ require 'openssl'
+ require 'uri'
+
+ require_relative 'omslog'
+
+ @@ConfigurationLoaded = false
+
+ @@Cert = nil
+ @@Key = nil
+
+ @@AgentId = nil
+ @@WorkspaceId = nil
+ @@ODSEndpoint = nil
+ @@DiagnosticEndpoint = nil
+ @@GetBlobODSEndpoint = nil
+ @@NotifyBlobODSEndpoint = nil
+ @@OmsCloudId = nil
+ @@AgentGUID = nil
+ @@URLTLD = nil
+ @@LogFacility = nil
+ @@AzureResourceId = nil
+ @@AzureRegion = nil
+ @@AzureIMDSEndpoint = "http://169.254.169.254/metadata/instance?api-version=2017-12-01"
+ @@AzureResIDThreadLock = Mutex.new
+ @@ProxyConfig = nil
+ @@ProxyConfigFilePath = "/etc/opt/microsoft/omsagent/proxy.conf"
+ @@UUID = nil
+ @@TopologyInterval = nil
+ @@TelemetryInterval = nil
+
+ class << self
+
# test the onboard file existence and readability for the omsagent user;
# logs once and returns false when either check fails.
def test_onboard_file(file_name)
  unless File.file?(file_name)
    OMS::Log.error_once("Could not find #{file_name} Make sure to onboard.")
    return false
  end

  unless File.readable?(file_name)
    OMS::Log.error_once("Could not read #{file_name} Check that the read permissions are set for the omsagent user")
    return false
  end

  true
end
+
# Load the proxy configuration, preferring the current path but falling back
# to the pre-existing conf location. Returns a hash with :user/:pass/:addr/
# :port keys (missing parts are nil), or {} when no usable config exists.
def get_proxy_config(proxy_conf_path)
  old_proxy_conf_path = '/etc/opt/microsoft/omsagent/conf/proxy.conf'
  if !File.exist?(proxy_conf_path) and File.exist?(old_proxy_conf_path)
    proxy_conf_path = old_proxy_conf_path
  end

  begin
    proxy_config = parse_proxy_config(File.read(proxy_conf_path))
  rescue SystemCallError # Errno::ENOENT
    return {}
  end

  if proxy_config.nil?
    OMS::Log.error_once("Failed to parse the proxy configuration in '#{proxy_conf_path}'")
    return {}
  end

  return proxy_config
end

# Parse a proxy spec of the form [http[s]://][user:pass@]host[:port].
# Returns a symbol-keyed hash of the named captures, or nil when the string
# uses an unsupported protocol or does not match.
def parse_proxy_config(proxy_conf_str)
  # Remove the http(s) protocol
  proxy_conf_str = proxy_conf_str.gsub(/^(https?:\/\/)?/, "")

  # Check for unsupported protocol
  if proxy_conf_str[/^[a-z]+:\/\//]
    return nil
  end

  # FIX: the named-capture group names were missing ("(?[^:]+)" is invalid
  # regex syntax); restore (?<user>), (?<pass>), (?<addr>), (?<port>), which
  # matches[:addr] and matches.names below depend on.
  re = /^(?:(?<user>[^:]+):(?<pass>[^@]+)@)?(?<addr>[^:@]+)(?::(?<port>\d+))?$/
  matches = re.match(proxy_conf_str)
  if matches.nil? or matches[:addr].nil?
    return nil
  end
  # Convert named matches to a symbol-keyed hash
  Hash[ matches.names.map{ |name| name.to_sym}.zip( matches.captures ) ]
end
+
# Query the Azure Instance Metadata Service for the VM's region.
# Returns the "location" string, or nil for classic VMs, containers, or
# non-Azure hosts (any error is treated as "not available").
def get_azure_region_from_imds()
  begin
    uri = URI.parse(@@AzureIMDSEndpoint)
    http_get_req = Net::HTTP::Get.new(uri, initheader = {'Metadata' => 'true'})

    http_req = Net::HTTP.new(uri.host, uri.port)

    # Short timeouts: IMDS is link-local and answers quickly or not at all.
    http_req.open_timeout = 3
    http_req.read_timeout = 2

    res = http_req.start() do |http|
      http.request(http_get_req)
    end

    imds_instance_json = JSON.parse(res.body)

    return nil if !imds_instance_json.has_key?("compute") || imds_instance_json['compute'].empty? #classic vm

    imds_instance_json_compute = imds_instance_json['compute']
    return nil unless imds_instance_json_compute.has_key?("location")
    return nil if imds_instance_json_compute['location'].empty?
    return imds_instance_json_compute['location']
  rescue => e
    # this may be a container instance or a non-Azure VM
    return nil
  end
end

# Query IMDS and assemble the full ARM resource id of this VM or VMSS
# instance. Returns nil when IMDS is unreachable or a required field is
# missing/blank (classic VM, container, or non-Azure host).
def get_azure_resid_from_imds()
  begin
    uri = URI.parse(@@AzureIMDSEndpoint)
    http_get_req = Net::HTTP::Get.new(uri, initheader = {'Metadata' => 'true'})

    http_req = Net::HTTP.new(uri.host, uri.port)

    http_req.open_timeout = 3
    http_req.read_timeout = 2

    res = http_req.start() do |http|
      http.request(http_get_req)
    end

    imds_instance_json = JSON.parse(res.body)

    return nil if !imds_instance_json.has_key?("compute") || imds_instance_json['compute'].empty? #classic vm

    imds_instance_json_compute = imds_instance_json['compute']

    #guard from missing keys
    return nil unless imds_instance_json_compute.has_key?("subscriptionId") && imds_instance_json_compute.has_key?("resourceGroupName") && imds_instance_json_compute.has_key?("name") && imds_instance_json_compute.has_key?("vmScaleSetName")

    #guard from blank values (vmScaleSetName may legitimately be blank: plain VM)
    return nil if imds_instance_json_compute['subscriptionId'].empty? || imds_instance_json_compute['resourceGroupName'].empty? || imds_instance_json_compute['name'].empty?

    azure_resource_id = '/subscriptions/' + imds_instance_json_compute['subscriptionId'] + '/resourceGroups/' + imds_instance_json_compute['resourceGroupName'] + '/providers/Microsoft.Compute/'

    # VMSS instances are addressed beneath their scale set resource
    if (imds_instance_json_compute['vmScaleSetName'].empty?)
      azure_resource_id = azure_resource_id + 'virtualMachines/' + imds_instance_json_compute['name']
    else
      azure_resource_id = azure_resource_id + 'virtualMachineScaleSets/' + imds_instance_json_compute['vmScaleSetName'] + '/virtualMachines/' + imds_instance_json_compute['name']
    end

    return azure_resource_id

  rescue => e
    # this may be a container instance or a non-Azure VM
    OMS::Log.warn_once("Could not fetch Azure Resource ID from IMDS, Reason: #{e}")
    return nil
  end
end
+
# Background refresher for @@AzureResourceId (started from load_configuration
# when the configured id is not an AKS one). Polls IMDS once a minute; on a
# failed fetch it backs off retries*120s and, after 3 consecutive failures,
# logs once and lets the thread terminate.
def update_azure_resource_id()
  retries=1
  max_retries=3

  loop do
    break if retries > max_retries
    azure_resource_id = get_azure_resid_from_imds()
    if azure_resource_id.nil?
      # back off harder on each consecutive failure
      sleep (retries * 120)
      retries += 1
      next
    end

    @@AzureResourceId = azure_resource_id unless @@AzureResourceId == azure_resource_id
    retries=1 #reset
    sleep 60
  end

  OMS::Log.warn_once("Exceeded max attempts to fetch Azure Resource ID, killing the thread")
  return #terminate
end
+
# load the configuration from the configuration file, cert, and key path.
# Populates the @@-state of this class (endpoints, agent/workspace ids,
# cert/key, Azure region/resource id) and returns true on success.
# Returns false — after logging — when any file is missing/unreadable or a
# required setting is absent or malformed. Idempotent: subsequent calls
# return true immediately once @@ConfigurationLoaded is set.
def load_configuration(conf_path, cert_path, key_path)
  return true if @@ConfigurationLoaded
  return false if !test_onboard_file(conf_path) or !test_onboard_file(cert_path) or !test_onboard_file(key_path)

  @@ProxyConfig = get_proxy_config(@@ProxyConfigFilePath)

  # OMS_ENDPOINT is mandatory; the blob endpoints are derived from it.
  endpoint_lines = IO.readlines(conf_path).select{ |line| line.start_with?("OMS_ENDPOINT")}
  if endpoint_lines.size == 0
    OMS::Log.error_once("Could not find OMS_ENDPOINT setting in #{conf_path}")
    return false
  elsif endpoint_lines.size > 1
    OMS::Log.warn_once("Found more than one OMS_ENDPOINT setting in #{conf_path}, will use the first one.")
  end

  begin
    endpoint_url = endpoint_lines[0].split("=")[1].strip
    @@ODSEndpoint = URI.parse( endpoint_url )
    @@GetBlobODSEndpoint = @@ODSEndpoint.clone
    @@GetBlobODSEndpoint.path = '/ContainerService.svc/GetBlobUploadUri'
    @@NotifyBlobODSEndpoint = @@ODSEndpoint.clone
    @@NotifyBlobODSEndpoint.path = '/ContainerService.svc/PostBlobUploadNotification'
  rescue => e
    OMS::Log.error_once("Error parsing endpoint url. #{e}")
    return false
  end

  # DIAGNOSTIC_ENDPOINT is optional; default is derived from the ODS endpoint.
  begin
    diagnostic_endpoint_lines = IO.readlines(conf_path).select{ |line| line.start_with?("DIAGNOSTIC_ENDPOINT=")}
    if diagnostic_endpoint_lines.size == 0
      # Endpoint to be inferred from @@ODSEndpoint
      @@DiagnosticEndpoint = @@ODSEndpoint.clone
      @@DiagnosticEndpoint.path = '/DiagnosticsDataService.svc/PostJsonDataItems'
    else
      if diagnostic_endpoint_lines.size > 1
        OMS::Log.warn_once("Found more than one DIAGNOSTIC_ENDPOINT setting in #{conf_path}, will use the first one.")
      end
      diagnostic_endpoint_url = diagnostic_endpoint_lines[0].split("=")[1].strip
      @@DiagnosticEndpoint = URI.parse( diagnostic_endpoint_url )
    end
  rescue => e
    OMS::Log.error_once("Error obtaining diagnostic endpoint url. #{e}")
    return false
  end

  # AGENT_GUID is mandatory.
  agentid_lines = IO.readlines(conf_path).select{ |line| line.start_with?("AGENT_GUID")}
  if agentid_lines.size == 0
    OMS::Log.error_once("Could not find AGENT_GUID setting in #{conf_path}")
    return false
  elsif agentid_lines.size > 1
    OMS::Log.warn_once("Found more than one AGENT_GUID setting in #{conf_path}, will use the first one.")
  end

  begin
    @@AgentId = agentid_lines[0].split("=")[1].strip
  rescue => e
    OMS::Log.error_once("Error parsing agent id. #{e}")
    return false
  end

  # Remaining settings are optional key=value lines.
  File.open(conf_path).each_line do |line|
    if line =~ /^WORKSPACE_ID/
      @@WorkspaceId = line.sub("WORKSPACE_ID=","").strip
    end
    if line =~ /AZURE_RESOURCE_ID/
      # We have contract with AKS team about how to pass AKS specific resource id.
      # As per contract, AKS team before starting the agent will set environment variable
      # 'customResourceId'
      @@AzureResourceId = ENV['customResourceId']

      # Only if environment variable is empty/nil load it from imds and refresh it periodically.
      if @@AzureResourceId.nil? || @@AzureResourceId.empty?
        @@AzureResourceId = line.sub("AZURE_RESOURCE_ID=","").strip
        if @@AzureResourceId.include? "Microsoft.ContainerService"
          OMS::Log.info_once("Azure resource id in configuration file is for AKS. It will be used")
        else
          # try_lock ensures only the first matching line spawns the refresher thread
          Thread.new(&method(:update_azure_resource_id)) if @@AzureResIDThreadLock.try_lock
        end
      else
        OMS::Log.info_once("There is non empty value set for overriden-resourceId environment variable. It will be used")
      end
    end
    if line =~ /OMSCLOUD_ID/
      @@OmsCloudId = line.sub("OMSCLOUD_ID=","").strip
    end
    if line =~ /^AGENT_GUID/
      @@AgentGUID = line.sub("AGENT_GUID=","").strip
    end
    if line =~ /^URL_TLD/
      @@URLTLD = line.sub("URL_TLD=","").strip
    end
    if line =~ /^LOG_FACILITY/
      @@LogFacility = line.sub("LOG_FACILITY=","").strip
    end
    if line =~ /UUID/
      @@UUID = line.sub("UUID=","").strip
    end
  end

  begin
    raw = File.read cert_path
    @@Cert = OpenSSL::X509::Certificate.new raw
    raw = File.read key_path
    @@Key = OpenSSL::PKey::RSA.new raw
  rescue => e
    OMS::Log.error_once("Error loading certs: #{e}")
    return false
  end

  # Region comes from IMDS; anything unreachable is treated as on-premise.
  @@AzureRegion = get_azure_region_from_imds()
  if @@AzureRegion.nil? || @@AzureRegion.empty?
    OMS::Log.warn_once("Azure region value is not set. This must be onpremise machine")
    @@AzureRegion = "OnPremise"
  end

  @@ConfigurationLoaded = true
  return true
end # load_configuration
+
# Record the topology/telemetry request intervals pushed down by the OMS
# management service, logging each new value once.
def set_request_intervals(topology_interval, telemetry_interval)
  @@TopologyInterval = topology_interval
  @@TelemetryInterval = telemetry_interval
  OMS::Log.info_once("OMS agent management service topology request interval now #{@@TopologyInterval}")
  OMS::Log.info_once("OMS agent management service telemetry request interval now #{@@TelemetryInterval}")
end
+
# Read-only accessors for the values populated by load_configuration and
# set_request_intervals.
def cert
  @@Cert
end # getter cert

def key
  @@Key
end # getter key

def workspace_id
  @@WorkspaceId
end # getter workspace_id

def agent_id
  @@AgentId
end # getter agent_id

def ods_endpoint
  @@ODSEndpoint
end # getter ods_endpoint

def diagnostic_endpoint
  @@DiagnosticEndpoint
end # getter diagnostic_endpoint

def get_blob_ods_endpoint
  @@GetBlobODSEndpoint
end # getter get_blob_ods_endpoint

def notify_blob_ods_endpoint
  @@NotifyBlobODSEndpoint
end # getter notify_blob_ods_endpoint

def azure_resource_id
  @@AzureResourceId
end

def omscloud_id
  @@OmsCloudId
end

def agent_guid
  @@AgentGUID
end # getter agent_guid

def url_tld
  @@URLTLD
end # getter url_tld

def log_facility
  @@LogFacility
end # getter log_facility

def uuid
  @@UUID
end # getter for VM uuid

def azure_region
  @@AzureRegion
end

def topology_interval
  @@TopologyInterval
end

def telemetry_interval
  @@TelemetryInterval
end
+
+ end # Class methods
+
+ end # class Common
+end # module OMS
diff --git a/source/plugins/ruby-fluentd4/oms_omi_lib.rb b/source/plugins/ruby-fluentd4/oms_omi_lib.rb
new file mode 100644
index 000000000..dbae8ca5f
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/oms_omi_lib.rb
@@ -0,0 +1,178 @@
+class OmiOms
+ require 'json'
+ require_relative 'oms_common'
+ require_relative 'omslog'
+
+ attr_reader :specific_mapping
+
# Bridges OMI performance-counter enumeration to OMS records.
#   object_name: perf object to collect (matched against ObjectName in the mapping file)
#   instance_regex / counter_name_regex: filters applied during enumeration
#   omi_mapping_path: JSON file mapping CIM properties to OMS counter names
#   omi_interface / common: injectable for testing; default to Libomi / OMS::Common
# On bad mapping config, sets @conf_error (checked by enumerate) instead of raising.
def initialize(object_name, instance_regex, counter_name_regex, omi_mapping_path, omi_interface=nil, common=nil)
  @object_name = object_name
  @counter_name_regex = counter_name_regex
  @instance_regex = instance_regex
  @omi_mapping_path = omi_mapping_path

  @specific_mapping = get_specific_mapping
  if @specific_mapping
    @conf_error = false
    @instance_property = @specific_mapping["InstanceProperty"]
    @cim_to_oms = get_cim_to_oms_mappings(@specific_mapping["CimProperties"])
  else
    @conf_error = true
    return
  end

  if common == nil
    common = OMS::Common
  end

  # FIX: was `@hostname = common.get_hostname or "Unknown host"`, which parses
  # as `(@hostname = common.get_hostname) or ...` because of `or`'s low
  # precedence — the "Unknown host" fallback was dead code. Use `||`.
  @hostname = common.get_hostname || "Unknown host"

  if omi_interface
    @omi_interface = omi_interface
  else
    require_relative 'Libomi'
    @omi_interface = Libomi::OMIInterface.new
  end
  @omi_interface.connect

end
+
# Locate the mapping entry for @object_name in the OMI->OMS mapping file.
# Returns the entry hash, or nil (after logging via $log) when the file
# cannot be read, cannot be parsed, or contains no matching ObjectName.
def get_specific_mapping
  begin
    file = File.read(@omi_mapping_path)
  rescue => error
    $log.error "Unable to read file #{@omi_mapping_path}"
    return
  end

  begin
    mapping = JSON.parse(file)
  rescue => error
    $log.error "Error parsing file #{@omi_mapping_path} : #{error}"
    return
  end

  specific_mapping = mapping.find { |class_info| class_info["ObjectName"] == @object_name }

  if specific_mapping.nil?
    $log.error "Could not find ObjectName '#{@object_name}' in #{@omi_mapping_path}"
    return
  end
  specific_mapping
end

# Build a lookup from CIM property name to its OMS counter name, or to a
# [counter_name, display_name] pair when a DisplayName is present.
def get_cim_to_oms_mappings(cimproperties)
  cimproperties.each_with_object({}) do |maps, cim_to_oms|
    cim_name = maps["CimPropertyName"]
    oms_name = maps["CounterName"]
    # Map cim_name to both CounterName and DisplayName when available
    if maps.has_key?("DisplayName")
      cim_to_oms[cim_name] = [oms_name, maps["DisplayName"]]
    else
      cim_to_oms[cim_name] = oms_name
    end
  end
end
+
# Convert one OMI instance (hash of CIM property => value) into the OMS
# record shape: Timestamp/Host/ObjectName/InstanceName plus a "Collections"
# array of {"CounterName", "Value"} pairs filtered by @counter_name_regex.
# Returns nil and flags @conf_error when the counter regex is invalid.
def omi_to_oms_instance(omi_instance, timestamp, wlm_enabled=false)
  oms_instance = {}
  oms_instance["Timestamp"] = timestamp
  if(!wlm_enabled)
    oms_instance["Host"] = @hostname
  else
    # WLM records carry the FQDN plus an extra property keyed by the object name
    oms_instance["Host"] = OMS::Common.get_fully_qualified_domain_name
    oms_instance[@object_name] = omi_instance[@specific_mapping["InstanceProperty"]]
  end
  oms_instance["ObjectName"] = @object_name
  # get the specific instance value given the instance property name (i.e. Name, InstanceId, etc. )
  oms_instance["InstanceName"] = omi_instance[@specific_mapping["InstanceProperty"]]
  oms_instance_collections = []

  # go through each CimProperties in the specific mapping,
  # if counterName is collected, perform the lookup for the value
  # else skip to the next property

  # Filter properties. Watch out! We get them as CIM but the regex is with OMS property names
  omi_instance.each do |property, value|
    # CimProperty may have a DisplayName attribute; if so, use that as the CounterName
    if @cim_to_oms[property].is_a?(Array)
      oms_property_name = @cim_to_oms[property][0]
      oms_property_display_name = @cim_to_oms[property][1]
    else
      oms_property_name = @cim_to_oms[property]
      # FIX: this was `oms_property_display_name == nil` — a no-op comparison,
      # not an assignment — so a display name from a previous iteration leaked
      # into this one and mislabeled plain counters. It must reset to nil.
      oms_property_display_name = nil
    end
    begin
      if /#{@counter_name_regex}/.match(oms_property_name)
        if value.nil?
          OMS::Log.warn_once("Dropping null value for counter #{oms_property_name}.")
        else
          counter_pair = {}
          if oms_property_display_name == nil
            counter_pair["CounterName"] = oms_property_name
          else
            counter_pair["CounterName"] = oms_property_display_name
          end
          counter_pair["Value"] = value
          oms_instance_collections.push(counter_pair)
        end
      end
    rescue RegexpError => e
      @conf_error = true
      $log.error "Regex error on counter_name_regex : #{e}"
      return
    end
  end
  oms_instance["Collections"] = oms_instance_collections
  return oms_instance
end
+
# Enumerate the mapped OMI class, filter instances by @instance_regex, and
# wrap the converted records in an upload envelope
# {"DataType", "IPName", "DataItems"}. Returns nil when configuration is
# broken or when no instance matched.
def enumerate(time, data_type="LINUX_PERF_BLOB", ip_name="LogManagement", wlm_enabled=false)
  return nil if @conf_error
  namespace = @specific_mapping["Namespace"]
  cim_class_name = @specific_mapping["CimClassName"]
  items = [[namespace, cim_class_name]]
  record_txt = @omi_interface.enumerate(items)
  instances = JSON.parse record_txt

  # Filter based on instance names
  begin
    instances.select!{ |instance|
      /#{@instance_regex}/.match(instance[@instance_property])
    }
  rescue RegexpError => e
    # Bad configured regex: disable this collector instead of crashing.
    @conf_error = true
    $log.error "Regex error on instance_regex : #{e}"
    return
  end

  timestamp = OMS::Common.format_time(time)
  # Convert instances to oms format
  instances.map!{ |instance|
    omi_to_oms_instance(instance, timestamp, wlm_enabled)
  }

  if instances.length > 0
    wrapper = {
      "DataType"=>data_type,
      "IPName"=>ip_name,
      "DataItems"=>instances
    }
    return wrapper
  end
end

# Close the underlying OMI connection.
def disconnect
  @omi_interface.disconnect
end
+end
diff --git a/source/plugins/ruby-fluentd4/omslog.rb b/source/plugins/ruby-fluentd4/omslog.rb
new file mode 100644
index 000000000..15c68f11c
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/omslog.rb
@@ -0,0 +1,52 @@
+# TODO: figure out where this file comes from. This copy was copied out of a running agent container, but it doesn't appear in the source anywhere
+
+module OMS
# De-duplicating logger: the first occurrence of a message (keyed by the MD5
# of an optional tag, defaulting to the message itself) is logged at the
# requested level; repeats are demoted to debug so they don't flood the log.
# Relies on the fluentd global $log being set.
class Log
  require 'set'
  require 'digest'

  @@error_proc = Proc.new {|message| $log.error message }
  @@warn_proc = Proc.new {|message| $log.warn message }
  @@info_proc = Proc.new {|message| $log.info message }
  @@debug_proc = Proc.new {|message| $log.debug message }

  # MD5 digests of tags that were already logged at their original level
  @@logged_hashes = Set.new

  class << self
    def error_once(message, tag=nil)
      log_once(@@error_proc, @@debug_proc, message, tag)
    end

    def warn_once(message, tag=nil)
      log_once(@@warn_proc, @@debug_proc, message, tag)
    end

    def info_once(message, tag=nil)
      log_once(@@info_proc, @@debug_proc, message, tag)
    end

    def log_once(first_loglevel_proc, next_loglevel_proc, message, tag=nil)
      # Will log a message once with the first procedure and subsequently with the second
      # This allows repeated messages to be ignored by having the second logging function at a lower log level
      # An optional tag can be used as the message key

      if tag == nil
        tag = message
      end

      md5_digest = Digest::MD5.new
      tag_hash = md5_digest.update(tag).base64digest
      # Set#add? returns nil when the element is already present
      res = @@logged_hashes.add?(tag_hash)

      if res == nil
        # The hash was already in the set
        next_loglevel_proc.call(message)
      else
        # First time we see this hash
        first_loglevel_proc.call(message)
      end
    end
  end # Class methods

end # Class Log
+end # Module OMS
diff --git a/source/plugins/ruby-fluentd4/out_mdm.rb b/source/plugins/ruby-fluentd4/out_mdm.rb
new file mode 100644
index 000000000..e56c3c20c
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/out_mdm.rb
@@ -0,0 +1,425 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+require 'fluent/plugin/output'
+
+module Fluent
+ class OutputMDM < Fluent::BufferedOutput
# Minutes to wait before re-attempting a POST after MDM rejects our data
config_param :retry_mdm_post_wait_minutes, :integer

Fluent::Plugin.register_output("mdm", self)
+
# Set up endpoint/auth templates and per-instance state.
# NOTE(review): class variables (@@...) are assigned inside #initialize;
# this relies on fluentd creating a single instance of the output plugin —
# confirm before instantiating more than once.
def initialize
  super
  require "net/http"
  require "net/https"
  require "uri"
  require "yajl/json_gem"
  require_relative "KubernetesApiClient"
  require_relative "ApplicationInsightsUtility"
  require_relative "constants"
  require_relative "arc_k8s_cluster_identity"
  require_relative "proxy_utils"

  @@token_resource_url = "https://monitoring.azure.com/"
  @@grant_type = "client_credentials"
  @@azure_json_path = "/etc/kubernetes/host/azure.json"
  @@post_request_url_template = "https://%{aks_region}.monitoring.azure.com%{aks_resource_id}/metrics"
  @@aad_token_url_template = "https://login.microsoftonline.com/%{tenant_id}/oauth2/token"

  # msiEndpoint is the well known endpoint for getting MSI authentications tokens
  @@msi_endpoint_template = "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&client_id=%{user_assigned_client_id}&resource=%{resource}"
  @@user_assigned_client_id = ENV["USER_ASSIGNED_IDENTITY_CLIENT_ID"]

  @@plugin_name = "AKSCustomMetricsMDM"
  @@record_batch_size = 2000 #2600 at times exceeds 1MB, not safe

  # Minutes; used both as the assumed token lifetime and the failure back-off
  @@token_refresh_back_off_interval = 30

  @data_hash = {}                      # parsed azure.json (SP credentials)
  @parsed_token_uri = nil
  @http_client = nil
  @token_expiry_time = Time.now
  @cached_access_token = String.new
  @last_post_attempt_time = Time.now
  @first_post_attempt_made = false
  @can_send_data_to_mdm = true
  @last_telemetry_sent_time = nil
  # Setting useMsi to false by default
  @useMsi = false
  @metrics_flushed_count = 0

  @cluster_identity = nil
  @isArcK8sCluster = false
  @get_access_token_backoff_expiry = Time.now

  # Exception message => count, flushed periodically as a telemetry metric
  @mdm_exceptions_hash = {}
  @mdm_exceptions_count = 0
  @mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i
end

# Standard fluentd configure hook; no extra handling beyond config_param.
def configure(conf)
  # s = conf.add_element("secondary")
  # s["type"] = ChunkErrorHandler::SecondaryName
  super
end
+
# fluentd start hook: validate AKS_RESOURCE_ID / AKS_REGION, build the MDM
# POST client (honoring cluster proxy config for Arc), and acquire the
# initial access token (cluster identity for Arc; SP or MSI for AKS).
# Any failure disables sending (@can_send_data_to_mdm) instead of crashing.
def start
  super
  begin
    aks_resource_id = ENV["AKS_RESOURCE_ID"]
    aks_region = ENV["AKS_REGION"]

    if aks_resource_id.to_s.empty?
      @log.info "Environment Variable AKS_RESOURCE_ID is not set.. "
      @can_send_data_to_mdm = false
    elsif !aks_resource_id.downcase.include?("/microsoft.containerservice/managedclusters/") && !aks_resource_id.downcase.include?("/microsoft.kubernetes/connectedclusters/")
      @log.info "MDM Metris not supported for this cluster type resource: #{aks_resource_id}"
      @can_send_data_to_mdm = false
    end

    if aks_region.to_s.empty?
      @log.info "Environment Variable AKS_REGION is not set.. "
      @can_send_data_to_mdm = false
    else
      # region may contain spaces (display name); the URL needs the compact form
      aks_region = aks_region.gsub(" ", "")
    end

    if @can_send_data_to_mdm
      @log.info "MDM Metrics supported in #{aks_region} region"

      if aks_resource_id.downcase.include?("microsoft.kubernetes/connectedclusters")
        @isArcK8sCluster = true
      end
      @@post_request_url = @@post_request_url_template % { aks_region: aks_region, aks_resource_id: aks_resource_id }
      @post_request_uri = URI.parse(@@post_request_url)
      if (!!@isArcK8sCluster)
        # Arc clusters may sit behind an outbound proxy
        proxy = (ProxyUtils.getProxyConfiguration)
        if proxy.nil? || proxy.empty?
          @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port)
        else
          @log.info "Proxy configured on this cluster: #{aks_resource_id}"
          @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port, proxy[:addr], proxy[:port], proxy[:user], proxy[:pass])
        end
      else
        @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port)
      end
      @http_client.use_ssl = true
      @log.info "POST Request url: #{@@post_request_url}"
      ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPluginStart", {})

      # arc k8s cluster uses cluster identity
      if (!!@isArcK8sCluster)
        @log.info "using cluster identity token since cluster is azure arc k8s cluster"
        @cluster_identity = ArcK8sClusterIdentity.new
        @cached_access_token = @cluster_identity.get_cluster_identity_token
      else
        # azure json file only used for aks and doesnt exist in non-azure envs
        file = File.read(@@azure_json_path)
        @data_hash = JSON.parse(file)
        # Check to see if SP exists, if it does use SP. Else, use msi
        sp_client_id = @data_hash["aadClientId"]
        sp_client_secret = @data_hash["aadClientSecret"]

        if (!sp_client_id.nil? && !sp_client_id.empty? && sp_client_id.downcase != "msi")
          @useMsi = false
          aad_token_url = @@aad_token_url_template % { tenant_id: @data_hash["tenantId"] }
          @parsed_token_uri = URI.parse(aad_token_url)
        else
          @useMsi = true
          msi_endpoint = @@msi_endpoint_template % { user_assigned_client_id: @@user_assigned_client_id, resource: @@token_resource_url }
          @parsed_token_uri = URI.parse(msi_endpoint)
        end

        @cached_access_token = get_access_token
      end
    end
  rescue => e
    @log.info "exception when initializing out_mdm #{e}"
    ApplicationInsightsUtility.sendExceptionTelemetry(e, { "FeatureArea" => "MDM" })
    return
  end
end
+
# get the access token only if the time to expiry is less than 5 minutes and get_access_token_backoff has expired
# Otherwise returns the cached token. A refresh attempt is retried up to 3
# times; after that a 30-minute back-off window is set before the next try.
def get_access_token
  if (Time.now > @get_access_token_backoff_expiry)
    http_access_token = nil
    retries = 0
    begin
      if @cached_access_token.to_s.empty? || (Time.now + 5 * 60 > @token_expiry_time) # Refresh token 5 minutes from expiration
        @log.info "Refreshing access token for out_mdm plugin.."

        if (!!@useMsi)
          @log.info "Using msi to get the token to post MDM data"
          ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-MSI", {})
          @log.info "Opening TCP connection"
          # IMDS token endpoint is plain HTTP on the link-local address
          http_access_token = Net::HTTP.start(@parsed_token_uri.host, @parsed_token_uri.port, :use_ssl => false)
          # http_access_token.use_ssl = false
          token_request = Net::HTTP::Get.new(@parsed_token_uri.request_uri)
          token_request["Metadata"] = true
        else
          @log.info "Using SP to get the token to post MDM data"
          ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-SP", {})
          @log.info "Opening TCP connection"
          http_access_token = Net::HTTP.start(@parsed_token_uri.host, @parsed_token_uri.port, :use_ssl => true)
          # http_access_token.use_ssl = true
          token_request = Net::HTTP::Post.new(@parsed_token_uri.request_uri)
          token_request.set_form_data(
            {
              "grant_type" => @@grant_type,
              "client_id" => @data_hash["aadClientId"],
              "client_secret" => @data_hash["aadClientSecret"],
              "resource" => @@token_resource_url,
            }
          )
        end

        @log.info "making request to get token.."
        token_response = http_access_token.request(token_request)
        # Handle the case where the response is not 200
        parsed_json = JSON.parse(token_response.body)
        @token_expiry_time = Time.now + @@token_refresh_back_off_interval * 60 # set the expiry time to be ~ thirty minutes from current time
        @cached_access_token = parsed_json["access_token"]
        @log.info "Successfully got access token"
      end
    rescue => err
      @log.info "Exception in get_access_token: #{err}"
      if (retries < 2)
        retries += 1
        @log.info "Retrying request to get token - retry number: #{retries}"
        sleep(retries)
        retry
      else
        @get_access_token_backoff_expiry = Time.now + @@token_refresh_back_off_interval * 60
        @log.info "@get_access_token_backoff_expiry set to #{@get_access_token_backoff_expiry}"
        ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" })
      end
    ensure
      # Close the connection whether the refresh succeeded or not
      if http_access_token
        @log.info "Closing http connection"
        http_access_token.finish
      end
    end
  end
  @cached_access_token
end
+
# Persist the outcome of the most recent MDM ingestion attempt as a small
# JSON status file so external watchdogs/health checks can inspect it.
#
# success - flag interpolated into the status payload ("true"/"false")
# message - human-readable detail interpolated into the status payload
#
# Write failures are swallowed (logged + telemetry) so status reporting can
# never break the data path.
def write_status_file(success, message)
  status_path = "/var/opt/microsoft/omsagent/log/MDMIngestion.status"
  payload = format('{ "operation": "MDMIngestion", "success": "%s", "message": "%s" }', success, message)
  begin
    File.open(status_path, "w") { |status_file| status_file.write(payload) }
  rescue => write_err
    @log.debug "Error:'#{write_err}'"
    ApplicationInsightsUtility.sendExceptionTelemetry(write_err)
  end
end
+
# Fluentd formatter hook: called for every event that reaches the plugin.
# Serializes the event as a msgpack-encoded [tag, record] pair for buffering;
# empty records are dropped by returning an empty string.
def format(tag, time, record)
  return "" if record == {}

  @log.trace "Buffering #{tag}"
  [tag, record].to_msgpack
end
+
# Tally an exception (keyed by its string form) into the aggregation hash and
# bump the running total. Totals are flushed periodically as a single metric
# (see flush_mdm_exception_telemetry) instead of one telemetry event per error.
def exception_aggregator(error)
  error_key = error.to_s
  @mdm_exceptions_hash[error_key] = (@mdm_exceptions_hash[error_key] || 0) + 1
  # Running total of every exception seen in the current flush window.
  @mdm_exceptions_count += 1
rescue => agg_err
  @log.info "Error in MDM exception_aggregator method: #{agg_err}"
  ApplicationInsightsUtility.sendExceptionTelemetry(agg_err)
end
+
# Emit the aggregated exception counters as a single metric once per
# MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL minutes, then reset the aggregation
# window. No-op until the interval has elapsed.
def flush_mdm_exception_telemetry
  elapsed_minutes = (DateTime.now.to_time.to_i - @mdm_exception_telemetry_time_tracker).abs / 60
  return if elapsed_minutes < Constants::MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL

  properties = {}
  properties["ExceptionsHashForFlushInterval"] = @mdm_exceptions_hash.to_json
  properties["FlushInterval"] = Constants::MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL
  ApplicationInsightsUtility.sendMetricTelemetry(Constants::MDM_EXCEPTION_TELEMETRY_METRIC, @mdm_exceptions_count, properties)
  # Start a fresh aggregation window.
  @mdm_exceptions_count = 0
  @mdm_exceptions_hash = {}
  @mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i
rescue => flush_err
  @log.info "Error in flush_mdm_exception_telemetry method: #{flush_err}"
  ApplicationInsightsUtility.sendExceptionTelemetry(flush_err)
end
+
# This method is called every flush interval. Send the buffer chunk to MDM.
# 'chunk' is a buffer chunk that includes multiple formatted records
# (msgpack-encoded [tag, record] pairs produced by #format).
# On failure the exception is tallied, logged and re-raised so the fluentd
# engine re-schedules the chunk for retry.
def write(chunk)
  begin
    # Adding this before trying to flush out metrics, since adding after can lead to metrics never being sent
    flush_mdm_exception_telemetry
    # Only POST when the pre-flight checks passed (@can_send_data_to_mdm) and
    # either no POST has been attempted yet or the back-off window has elapsed.
    # NOTE(review): this line uses the config accessor `retry_mdm_post_wait_minutes`
    # while the log message below reads `@retry_mdm_post_wait_minutes`; both should
    # resolve to the same config_param value — confirm.
    if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes * 60)) && @can_send_data_to_mdm
      post_body = []
      chunk.msgpack_each { |(tag, record)|
        post_body.push(record.to_json)
      }
      # The MDM payload limit is 1MB. Each record is ~300 bytes. Using a batch size
      # of 2600 (@@record_batch_size), so the payload size becomes approximately 800 Kb.
      count = post_body.size
      while count > 0
        current_batch = post_body.first(@@record_batch_size)
        post_body = post_body.drop(current_batch.size)
        count -= current_batch.size
        send_to_mdm current_batch
      end
    else
      if !@can_send_data_to_mdm
        @log.info "Cannot send data to MDM since all required conditions were not met"
      else
        @log.info "Last Failed POST attempt to MDM was made #{((Time.now - @last_post_attempt_time) / 60).round(1)} min ago. This is less than the current retry threshold of #{@retry_mdm_post_wait_minutes} min. NO-OP"
      end
    end
  rescue Exception => e
    # Adding exceptions to hash to aggregate and send telemetry for all write errors
    # NOTE(review): rescuing Exception (not StandardError) re-raises for fluentd's
    # retry machinery, but it also traps SystemExit/Interrupt — confirm intended.
    exception_aggregator(e)
    @log.info "Exception when writing to MDM: #{e}"
    raise e
  end
end
+
# POST one batch of metric JSON records to MDM as newline-delimited JSON.
# Authentication: Arc K8s cluster-identity token when @isArcK8sCluster is set,
# otherwise an AAD access token from get_access_token.
# 4xx responses are logged and swallowed (403 additionally arms the POST
# back-off timestamps); any other failure is re-raised so the caller/engine
# can retry the chunk.
def send_to_mdm(post_body)
  begin
    if (!!@isArcK8sCluster)
      # Lazily construct the Arc cluster identity helper on first use.
      if @cluster_identity.nil?
        @cluster_identity = ArcK8sClusterIdentity.new
      end
      access_token = @cluster_identity.get_cluster_identity_token
    else
      access_token = get_access_token
    end
    request = Net::HTTP::Post.new(@post_request_uri.request_uri)
    request["Content-Type"] = "application/x-ndjson"
    request["Authorization"] = "Bearer #{access_token}"

    request.body = post_body.join("\n")
    @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024}"
    response = @http_client.request(request)
    response.value # this throws for non 200 HTTP response code
    @log.info "HTTP Post Response Code : #{response.code}"
    # Throttle the success telemetry event to at most once per hour.
    if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now
      ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {})
      @last_telemetry_sent_time = Time.now
    end
  rescue Net::HTTPServerException => e
    # Raised by response.value for 4xx status codes, so `response` is expected
    # to be populated in this branch.
    if !response.nil? && !response.body.nil? #body will have actual error
      @log.info "Failed to Post Metrics to MDM : #{e} Response.body: #{response.body}"
    else
      @log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}"
    end
    @log.debug_backtrace(e.backtrace)
    if !response.code.empty? && response.code == 403.to_s
      # 403: credentials/permissions problem — arm the back-off window checked in #write.
      @log.info "Response Code #{response.code} Updating @last_post_attempt_time"
      @last_post_attempt_time = Time.now
      @first_post_attempt_made = true
      # Not raising exception, as that will cause retries to happen
    elsif !response.code.empty? && response.code.start_with?("4")
      # Log 400 errors and continue
      @log.info "Non-retryable HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}"
    else
      # raise if the response code is non-400
      @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}"
      raise e
    end
    # Adding exceptions to hash to aggregate and send telemetry for all 400 error codes
    exception_aggregator(e)
  rescue Errno::ETIMEDOUT => e
    @log.info "Timed out when POSTing Metrics to MDM : #{e} Response: #{response}"
    @log.debug_backtrace(e.backtrace)
    raise e
  rescue Exception => e
    # NOTE(review): rescuing Exception traps interpreter-level errors too;
    # re-raised here, so retries still occur — confirm StandardError wouldn't suffice.
    @log.info "Exception POSTing Metrics to MDM : #{e} Response: #{response}"
    @log.debug_backtrace(e.backtrace)
    raise e
  end
end
+
+ private
+
# Secondary output used as an error handler for failed chunks: re-emits every
# record of a failed chunk under an error tag (OMS::Common.create_error_tag)
# so separate <match> rules can pick them up; tags with no matching route get
# a no-op handler and their records are dropped.
class ChunkErrorHandler
  include Configurable
  include PluginId
  include PluginLoggerMixin

  SecondaryName = "__ChunkErrorHandler__"

  Fluent::Plugin.register_output(SecondaryName, self)

  def initialize
    @router = nil
  end

  def secondary_init(primary)
    # Handler map is a Hash with a default block: handlers are resolved and
    # cached per tag on first access (see create_error_handlers).
    @error_handlers = create_error_handlers @router
  end

  def start
    # NOP
  end

  def shutdown
    # NOP
  end

  def router=(r)
    @router = r
  end

  # Re-route every [tag, record] pair of the failed chunk through the
  # per-tag error handler.
  def write(chunk)
    chunk.msgpack_each { |(tag, record)|
      @error_handlers[tag].emit(record)
    }
  end

  private

  # Returns a Hash whose default block lazily maps a tag to an ErrorHandler
  # (when the router has a route for the error tag) or a shared no-op handler.
  def create_error_handlers(router)
    nop_handler = NopErrorHandler.new
    Hash.new() { |hash, tag|
      etag = OMS::Common.create_error_tag tag
      hash[tag] = router.match?(etag) ?
        ErrorHandler.new(router, etag) :
        nop_handler
    }
  end

  # Emits records back into the router under the precomputed error tag.
  class ErrorHandler
    def initialize(router, etag)
      @router = router
      @etag = etag
    end

    def emit(record)
      @router.emit(@etag, Fluent::Engine.now, record)
    end
  end

  # Swallows records whose error tag has no matching route.
  class NopErrorHandler
    def emit(record)
      # NOP
    end
  end
end
+ end # class OutputMDM
+end # module Fluent
diff --git a/source/plugins/ruby-fluentd4/out_oms.rb b/source/plugins/ruby-fluentd4/out_oms.rb
new file mode 100644
index 000000000..1302a1a00
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/out_oms.rb
@@ -0,0 +1,209 @@
+require 'fluent/plugin/output'
+
+module Fluent
+
# Buffered output plugin that ships grouped records to the OMS (Log Analytics)
# ODS ingestion endpoint, merging records by DataType/IPName per request.
# NOTE(review): the class derives from the fluentd v0.12-style
# Fluent::BufferedOutput while the file requires 'fluent/plugin/output'
# (v1 API) — confirm which plugin API generation this is meant to run under.
class OutputOMS < Fluent::BufferedOutput

  Fluent::Plugin.register_output('oms', self)

  # Endpoint URL ex. localhost.local/api/

  def initialize
    super
    # Requires live here so loading the plugin file stays cheap until
    # fluentd actually instantiates the output.
    require 'net/http'
    require 'net/https'
    require 'uri'
    require 'yajl'
    require 'openssl'
    require_relative 'omslog'
    require_relative 'oms_configuration'
    require_relative 'oms_common'
    require_relative 'agent_telemetry_script'
  end

  # Onboarding configuration and client certificate/key used to authenticate
  # against ODS; `compress` toggles request-body compression.
  config_param :omsadmin_conf_path, :string, :default => '/etc/opt/microsoft/omsagent/conf/omsadmin.conf'
  config_param :cert_path, :string, :default => '/etc/opt/microsoft/omsagent/certs/oms.crt'
  config_param :key_path, :string, :default => '/etc/opt/microsoft/omsagent/certs/oms.key'
  config_param :proxy_conf_path, :string, :default => '/etc/opt/microsoft/omsagent/proxy.conf'
  config_param :compress, :bool, :default => true

  def configure(conf)
    # s = conf.add_element("secondary")
    # s["type"] = ChunkErrorHandler::SecondaryName
    super
  end

  def start
    super
    # Proxy settings are read once at startup from proxy_conf_path.
    @proxy_config = OMS::Configuration.get_proxy_config(@proxy_conf_path)
  end

  def shutdown
    super
  end

  # Persist the outcome of the last ODS send as a small JSON status file;
  # write failures are logged and ignored.
  def write_status_file(success, message)
    fn = '/var/opt/microsoft/omsagent/log/ODSIngestion.status'
    status = '{ "operation": "ODSIngestion", "success": "%s", "message": "%s" }' % [success, message]
    begin
      File.open(fn,'w') { |file| file.write(status) }
    rescue => e
      @log.debug "Error:'#{e}'"
    end
  end

  # Send one merged record (all DataItems of a DataType/IPName pair) to ODS.
  # Returns true on success. Retryable failures re-raise so fluentd retries
  # the chunk; unexpected failures drop the data (see comment below).
  def handle_record(key, record)
    @log.trace "Handling record : #{key}"
    req = OMS::Common.create_ods_request(OMS::Configuration.ods_endpoint.path, record, @compress)
    unless req.nil?
      http = OMS::Common.create_ods_http(OMS::Configuration.ods_endpoint, @proxy_config)
      start = Time.now

      # This method will raise on failure alerting the engine to retry sending this data
      OMS::Common.start_request(req, http)

      ends = Time.now
      time = ends - start
      count = record.has_key?('DataItems') ? record['DataItems'].size : 1
      @log.debug "Success sending #{key} x #{count} in #{time.round(2)}s"
      write_status_file("true","Sending success")
      OMS::Telemetry.push_qos_event(OMS::SEND_BATCH, "true", "", key, record, count, time)
      return true
    end
  rescue OMS::RetryRequestException => e
    @log.info "Encountered retryable exception. Will retry sending data later."
    @log.debug "Error:'#{e}'"
    # Re-raise the exception to inform the fluentd engine we want to retry sending this chunk of data later.
    # NOTE(review): `raise e.message` re-raises as a RuntimeError, losing the
    # original exception class and backtrace — confirm whether `raise e` was meant.
    write_status_file("false","Retryable exception")
    raise e.message
  rescue => e
    # We encountered something unexpected. We drop the data because
    # if bad data caused the exception, the engine will continuously
    # try and fail to resend it. (Infinite failure loop)
    OMS::Log.error_once("Unexpected exception, dropping data. Error:'#{e}'")
    write_status_file("false","Unexpected exception")
  end

  # This method is called when an event reaches to Fluentd.
  # Convert the event to a raw string (msgpack [tag, record] pair);
  # empty records are dropped by returning an empty string.
  def format(tag, time, record)
    if record != {}
      @log.trace "Buffering #{tag}"
      return [tag, record].to_msgpack
    else
      return ""
    end
  end

  # This method is called every flush interval. Send the buffer chunk to OMS.
  # 'chunk' is a buffer chunk that includes multiple formatted records.
  # NOTE! This method is called by internal thread, not Fluentd's main thread. So IO wait doesn't affect other plugins.
  def write(chunk)
    # Quick exit if we are missing something
    if !OMS::Configuration.load_configuration(omsadmin_conf_path, cert_path, key_path)
      raise OMS::RetryRequestException, 'Missing configuration. Make sure to onboard.'
    end

    # Group records based on their datatype because OMS does not support a single request with multiple datatypes.
    datatypes = {}
    unmergable_records = []
    chunk.msgpack_each {|(tag, record)|
      if record.has_key?('DataType') and record.has_key?('IPName')
        key = "#{record['DataType']}.#{record['IPName']}".upcase

        if datatypes.has_key?(key)
          # Merge instances of the same datatype and ipname together
          datatypes[key]['DataItems'].concat(record['DataItems'])
        else
          if record.has_key?('DataItems')
            datatypes[key] = record
          else
            # Record has no DataItems array to merge into — send it individually.
            unmergable_records << [key, record]
          end
        end
      else
        @log.warn "Missing DataType or IPName field in record from tag '#{tag}'"
      end
    }

    datatypes.each do |key, record|
      handle_record(key, record)
    end

    @log.trace "Handling #{unmergable_records.size} unmergeable records"
    unmergable_records.each { |key, record|
      handle_record(key, record)
    }
  end

  private

  # Secondary output that re-emits records of failed chunks under an error tag
  # (OMS::Common.create_error_tag) so separate <match> rules can process them;
  # tags with no matching route fall through to a no-op handler.
  class ChunkErrorHandler
    include Fluent::Configurable
    include Fluent::PluginId
    include Fluent::PluginLoggerMixin

    SecondaryName = "__ChunkErrorHandler__"

    Fluent::Plugin.register_output(SecondaryName, self)

    def initialize
      @router = nil
    end

    def secondary_init(primary)
      @error_handlers = create_error_handlers @router
    end

    def start
      # NOP
    end

    def shutdown
      # NOP
    end

    def router=(r)
      @router = r
    end

    # Re-route every [tag, record] pair of the failed chunk through the
    # per-tag error handler.
    def write(chunk)
      chunk.msgpack_each {|(tag, record)|
        @error_handlers[tag].emit(record)
      }
    end

    private

    # Returns a Hash whose default block lazily maps a tag to an ErrorHandler
    # (when the router routes the error tag) or to a shared no-op handler.
    def create_error_handlers(router)
      nop_handler = NopErrorHandler.new
      Hash.new() { |hash, tag|
        etag = OMS::Common.create_error_tag tag
        hash[tag] = router.match?(etag) ?
          ErrorHandler.new(router, etag) :
          nop_handler
      }
    end

    # Emits records back into the router under the precomputed error tag.
    class ErrorHandler
      def initialize(router, etag)
        @router = router
        @etag = etag
      end

      def emit(record)
        @router.emit(@etag, Fluent::Engine.now, record)
      end
    end

    # Swallows records whose error tag has no matching route.
    class NopErrorHandler
      def emit(record)
        # NOP
      end
    end

  end

end # class OutputOMS
+
+end # module Fluent
\ No newline at end of file
diff --git a/source/plugins/ruby-fluentd4/podinventory_to_mdm.rb b/source/plugins/ruby-fluentd4/podinventory_to_mdm.rb
new file mode 100644
index 000000000..77370e284
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/podinventory_to_mdm.rb
@@ -0,0 +1,321 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+
+# frozen_string_literal: true
+
+require "logger"
+require "yajl/json_gem"
+require "time"
+require_relative "oms_common"
+require_relative "CustomMetricsUtils"
+require_relative "MdmMetricsGenerator"
+# require_relative "mdmMetrics"
+require_relative "constants"
+
# Converts Kubernetes pod-inventory records into MDM custom-metric JSON
# payloads (pod counts grouped by phase/namespace/node/controller) and feeds
# the per-pod health metric generators in MdmMetricsGenerator.
class Inventory2MdmConvertor
  @@node_count_metric_name = "nodesCount"
  @@pod_count_metric_name = "podCount"
  @@pod_inventory_tag = "mdm.kubepodinventory"
  @@node_inventory_tag = "mdm.kubenodeinventory"
  @@node_status_ready = "Ready"
  @@node_status_not_ready = "NotReady"
  @@oom_killed = "oomkilled"
  # Epoch seconds of the last telemetry flush; class-level so it persists
  # across converter instances.
  @@metricTelemetryTimeTracker = DateTime.now.to_time.to_i

  # %{}-style template for a node-count MDM metric payload.
  # NOTE(review): not referenced by any method visible in this file —
  # presumably used by a node-inventory path elsewhere; confirm before removing.
  @@node_inventory_custom_metrics_template = '
    {
      "time": "%{timestamp}",
      "data": {
        "baseData": {
          "metric": "%{metricName}",
          "namespace": "insights.container/nodes",
          "dimNames": [
            "status"
          ],
          "series": [
            {
              "dimValues": [
                "%{statusValue}"
              ],
              "min": %{node_status_count},
              "max": %{node_status_count},
              "sum": %{node_status_count},
              "count": 1
            }
          ]
        }
      }
    }'

  # %{}-style template for a pod-count MDM metric payload; rendered once per
  # distinct (phase, namespace, node, controller) combination.
  @@pod_inventory_custom_metrics_template = '
    {
      "time": "%{timestamp}",
      "data": {
        "baseData": {
          "metric": "%{metricName}",
          "namespace": "insights.container/pods",
          "dimNames": [
            "phase",
            "Kubernetes namespace",
            "node",
            "controllerName"
          ],
          "series": [
            {
              "dimValues": [
                "%{phaseDimValue}",
                "%{namespaceDimValue}",
                "%{nodeDimValue}",
                "%{controllerNameDimValue}"
              ],
              "min": %{podCountMetricValue},
              "max": %{podCountMetricValue},
              "sum": %{podCountMetricValue},
              "count": 1
            }
          ]
        }
      }
    }'

  @@pod_phase_values = ["Running", "Pending", "Succeeded", "Failed", "Unknown"]
  # NOTE(review): this assigns a class-level instance variable that nothing
  # visible reads (initialize sets the per-instance value below) — looks like
  # dead code; confirm.
  @process_incoming_stream = false

  def initialize()
    @log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log"
    # Rotating logger: keep 1 old file, rotate at ~5 MB.
    @log = Logger.new(@log_path, 1, 5000000)
    @pod_count_hash = {}           # "node~~ns~~controller~~phase" => count
    @no_phase_dim_values_hash = {} # "node~~ns~~controller" => true (combinations seen)
    @pod_count_by_phase = {}       # phase => count (used for logging only)
    @pod_uids = {}                 # PodUid => true; dedupes pod records per batch
    @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability
    @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}"
    @log.debug { "Starting podinventory_to_mdm plugin" }
  end

  # Build the list of MDM pod-count records for this batch: zero-fill every
  # known dimension combination across all pod phases, render the template per
  # combination, append MdmMetricsGenerator's pod metrics, flush telemetry on
  # its interval, then reset the per-batch state.
  # Returns [] when custom metrics are disabled or on error.
  def get_pod_inventory_mdm_records(batch_time)
    records = []
    begin
      if @process_incoming_stream
        # generate all possible values of non_phase_dim_values X pod Phases and zero-fill the ones that are not already present
        @no_phase_dim_values_hash.each { |key, value|
          @@pod_phase_values.each { |phase|
            pod_key = [key, phase].join("~~")
            if !@pod_count_hash.key?(pod_key)
              @pod_count_hash[pod_key] = 0
            else
              next
            end
          }
        }
        @pod_count_hash.each { |key, value|
          key_elements = key.split("~~")
          if key_elements.length != 4
            next
          end

          # get dimension values by key
          podNodeDimValue = key_elements[0]
          podNamespaceDimValue = key_elements[1]
          podControllerNameDimValue = key_elements[2]
          podPhaseDimValue = key_elements[3]

          record = @@pod_inventory_custom_metrics_template % {
            timestamp: batch_time,
            metricName: @@pod_count_metric_name,
            phaseDimValue: podPhaseDimValue,
            namespaceDimValue: podNamespaceDimValue,
            nodeDimValue: podNodeDimValue,
            controllerNameDimValue: podControllerNameDimValue,
            podCountMetricValue: value,
          }
          records.push(JSON.parse(record))
        }

        #Add pod metric records
        records = MdmMetricsGenerator.appendAllPodMetrics(records, batch_time)

        #Send telemetry for pod metrics
        timeDifference = (DateTime.now.to_time.to_i - @@metricTelemetryTimeTracker).abs
        timeDifferenceInMinutes = timeDifference / 60
        if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES)
          MdmMetricsGenerator.flushPodMdmMetricTelemetry
          @@metricTelemetryTimeTracker = DateTime.now.to_time.to_i
        end

        # Clearing out all hashes after telemetry is flushed
        MdmMetricsGenerator.clearPodHashes
      end
    rescue Exception => e
      @log.info "Error processing pod inventory record Exception: #{e.class} Message: #{e.message}"
      ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace)
      return []
    end
    if @process_incoming_stream
      @log.info "Pod Count To Phase #{@pod_count_by_phase} "
      @log.info "resetting convertor state "
      @pod_count_hash = {}
      @no_phase_dim_values_hash = {}
      @pod_count_by_phase = {}
      @pod_uids = {}
    end
    return records
  end

  # Check if container was terminated in the last 5 minutes
  # (CONTAINER_TERMINATED_RECENTLY_IN_MINUTES). Returns false on nil/empty
  # input or parse errors.
  def is_container_terminated_recently(finishedTime)
    begin
      if !finishedTime.nil? && !finishedTime.empty?
        finishedTimeParsed = Time.parse(finishedTime)
        if ((Time.now - finishedTimeParsed) / 60) < Constants::CONTAINER_TERMINATED_RECENTLY_IN_MINUTES
          return true
        end
      end
    rescue => errorStr
      @log.warn("Exception in check_if_terminated_recently: #{errorStr}")
      ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
    end
    return false
  end

  # Emit an OOM-killed container metric for recently terminated containers.
  def process_record_for_oom_killed_metric(podControllerNameDimValue, podNamespaceDimValue, finishedTime)
    if @process_incoming_stream
      begin
        @log.info "in process_record_for_oom_killed_metric..."

        # Send OOM Killed state for container only if it terminated in the last 5 minutes, we dont want to keep sending this count forever
        if is_container_terminated_recently(finishedTime)
          if podControllerNameDimValue.nil? || podControllerNameDimValue.empty?
            podControllerNameDimValue = "No Controller"
          end
          MdmMetricsGenerator.generateOOMKilledContainerMetrics(podControllerNameDimValue,
                                                                podNamespaceDimValue)
        end
      rescue => errorStr
        @log.warn("Exception in process_record_for_oom_killed_metric: #{errorStr}")
        ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
      end
    end
  end

  # Emit a restarting-container metric for recently terminated containers.
  def process_record_for_container_restarts_metric(podControllerNameDimValue, podNamespaceDimValue, finishedTime)
    if @process_incoming_stream
      begin
        @log.info "in process_record_for_container_restarts_metric..."

        # Send restart state for container only if it terminated in the last 5 minutes, we dont want to keep sending this count forever
        if is_container_terminated_recently(finishedTime)
          if podControllerNameDimValue.nil? || podControllerNameDimValue.empty?
            podControllerNameDimValue = "No Controller"
          end
          MdmMetricsGenerator.generateRestartingContainersMetrics(podControllerNameDimValue,
                                                                  podNamespaceDimValue)
        end
      rescue => errorStr
        @log.warn("Exception in process_record_for_container_restarts_metric: #{errorStr}")
        ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
      end
    end
  end

  # Emit a pods-ready metric derived from the pod's "Ready" status condition.
  def process_record_for_pods_ready_metric(podControllerNameDimValue, podNamespaceDimValue, podStatusConditions)
    if @process_incoming_stream
      begin
        @log.info "in process_record_for_pods_ready_metric..."
        if podControllerNameDimValue.nil? || podControllerNameDimValue.empty?
          podControllerNameDimValue = "No Controller"
        end
        podReadyCondition = false
        if !podStatusConditions.nil? && !podStatusConditions.empty?
          podStatusConditions.each do |condition|
            if condition["type"] == "Ready"
              if condition["status"].downcase == "true"
                podReadyCondition = true
              end
              break #Exit the for loop since we found the ready condition
            end
          end
        end
        MdmMetricsGenerator.generatePodReadyMetrics(podControllerNameDimValue,
                                                    podNamespaceDimValue, podReadyCondition)
      rescue => errorStr
        @log.warn("Exception in process_record_for_pods_ready_metric: #{errorStr}")
        ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
      end
    end
  end

  # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm
  def process_record_for_terminated_job_metric(podControllerNameDimValue, podNamespaceDimValue, containerStatus)
    if @process_incoming_stream
      begin
        @log.info "in process_record_for_terminated_job_metric..."
        if podControllerNameDimValue.nil? || podControllerNameDimValue.empty?
          podControllerNameDimValue = "No Controller"
        end
        if !containerStatus.keys[0].nil? && containerStatus.keys[0].downcase == Constants::CONTAINER_STATE_TERMINATED
          containerTerminatedReason = containerStatus["terminated"]["reason"]
          if !containerTerminatedReason.nil? && containerTerminatedReason.downcase == Constants::CONTAINER_TERMINATION_REASON_COMPLETED
            containerFinishedTime = containerStatus["terminated"]["finishedAt"]
            if !containerFinishedTime.nil? && !containerFinishedTime.empty?
              finishedTimeParsed = Time.parse(containerFinishedTime)
              # Check to see if job was completed 6 hours ago/STALE_JOB_TIME_IN_MINUTES
              if ((Time.now - finishedTimeParsed) / 60) > Constants::STALE_JOB_TIME_IN_MINUTES
                MdmMetricsGenerator.generateStaleJobCountMetrics(podControllerNameDimValue,
                                                                 podNamespaceDimValue)
              end
            end
          end
        end
      rescue => errorStr
        @log.warn("Exception in process_record_for_terminated_job: #{errorStr}")
        ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
      end
    end
  end

  # Accumulate one pod-inventory record into the per-batch counting hashes
  # (deduped by PodUid). Pending pods without a node are counted under
  # "unscheduled"; other node-less pods under "unknown".
  def process_pod_inventory_record(record)
    if @process_incoming_stream
      begin
        records = []

        podUid = record["DataItems"][0]["PodUid"]
        if @pod_uids.key?(podUid)
          return
        end

        @pod_uids[podUid] = true
        podPhaseDimValue = record["DataItems"][0]["PodStatus"]
        podNamespaceDimValue = record["DataItems"][0]["Namespace"]
        podControllerNameDimValue = record["DataItems"][0]["ControllerName"]
        podNodeDimValue = record["DataItems"][0]["Computer"]

        if podControllerNameDimValue.nil? || podControllerNameDimValue.empty?
          podControllerNameDimValue = "No Controller"
        end

        if podNodeDimValue.empty? && podPhaseDimValue.downcase == "pending"
          podNodeDimValue = "unscheduled"
        elsif podNodeDimValue.empty?
          podNodeDimValue = "unknown"
        end

        # group by distinct dimension values
        pod_key = [podNodeDimValue, podNamespaceDimValue, podControllerNameDimValue, podPhaseDimValue].join("~~")

        @pod_count_by_phase[podPhaseDimValue] = @pod_count_by_phase.key?(podPhaseDimValue) ? @pod_count_by_phase[podPhaseDimValue] + 1 : 1
        @pod_count_hash[pod_key] = @pod_count_hash.key?(pod_key) ? @pod_count_hash[pod_key] + 1 : 1

        # Collect all possible combinations of dimension values other than pod phase
        key_without_phase_dim_value = [podNodeDimValue, podNamespaceDimValue, podControllerNameDimValue].join("~~")
        if @no_phase_dim_values_hash.key?(key_without_phase_dim_value)
          return
        else
          @no_phase_dim_values_hash[key_without_phase_dim_value] = true
        end
      rescue Exception => e
        @log.info "Error processing pod inventory record Exception: #{e.class} Message: #{e.message}"
        ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace)
      end
    end
  end
end
diff --git a/source/plugins/ruby-fluentd4/proxy_utils.rb b/source/plugins/ruby-fluentd4/proxy_utils.rb
new file mode 100644
index 000000000..1566fe4e9
--- /dev/null
+++ b/source/plugins/ruby-fluentd4/proxy_utils.rb
@@ -0,0 +1,45 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+
# Utilities for reading and parsing the cluster's outbound proxy configuration.
class ProxyUtils
  class << self
    # Read the proxy configuration from the mounted secret and return it as a
    # hash with :user, :pass, :addr and :port keys (values may be nil when a
    # part is absent). Returns {} when the secret is missing or unreadable.
    def getProxyConfiguration()
      omsproxy_secret_path = "/etc/omsagent-secret/PROXY"
      if !File.exist?(omsproxy_secret_path)
        return {}
      end

      begin
        proxy_config = parseProxyConfiguration(File.read(omsproxy_secret_path))
      rescue SystemCallError # Error::ENOENT
        return {}
      end

      if proxy_config.nil?
        $log.warn("Failed to parse the proxy configuration in '#{omsproxy_secret_path}'")
        return {}
      end

      return proxy_config
    end

    # Parse "[http[s]://][user:pass@]addr[:port]" into a hash keyed by the
    # named capture symbols (:user, :pass, :addr, :port).
    # Returns nil for unsupported protocols or unparsable input.
    def parseProxyConfiguration(proxy_conf_str)
      # Remove the http(s) protocol
      proxy_conf_str = proxy_conf_str.gsub(/^(https?:\/\/)?/, "")

      # Check for unsupported protocol
      if proxy_conf_str[/^[a-z]+:\/\//]
        return nil
      end

      # BUGFIX: the capture groups had lost their names (e.g. "(?[^:]+)"),
      # which is invalid regexp syntax and would raise at parse time; restore
      # the <user>, <pass>, <addr>, <port> names that matches[:addr] relies on.
      re = /^(?:(?<user>[^:]+):(?<pass>[^@]+)@)?(?<addr>[^:@]+)(?::(?<port>\d+))?$/
      matches = re.match(proxy_conf_str)
      if matches.nil? or matches[:addr].nil?
        return nil
      end
      # Convert named matches to a hash with symbol keys
      Hash[ matches.names.map{ |name| name.to_sym }.zip( matches.captures ) ]
    end
  end
end
\ No newline at end of file