diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf
index 53040e2f9..5b3837748 100644
--- a/build/linux/installer/conf/kube.conf
+++ b/build/linux/installer/conf/kube.conf
@@ -1,99 +1,80 @@
- #Kubernetes pod inventory
-
- @type kube_podinventory
- tag oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB
- run_interval 60
- @log_level debug
-
-
- #Kubernetes Persistent Volume inventory
-
- @type kube_pvinventory
- tag oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB
- run_interval 60
- @log_level debug
-
-
- #Kubernetes events
-
- @type kube_events
- tag oneagent.containerInsights.KUBE_EVENTS_BLOB
- run_interval 60
- @log_level debug
-
-
- #Kubernetes Nodes
-
- @type kube_nodes
- tag oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB
- run_interval 60
- @log_level debug
-
-
- #cadvisor perf- Windows nodes
-
- @type win_cadvisor_perf
- tag oneagent.containerInsights.LINUX_PERF_BLOB
- run_interval 60
- @log_level debug
-
-
- #Kubernetes object state - deployments
-
- @type kubestate_deployments
- tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB
- run_interval 60
- @log_level debug
-
+ #fluent forward plugin
+
+ workers "#{ENV['NUM_OF_FLUENTD_WORKERS']}"
+ root_dir /var/opt/microsoft/docker-cimprov/state
+
- #Kubernetes object state - HPA
-
- @type kubestate_hpa
- tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB
- run_interval 60
- @log_level debug
-
+ #perf
+
+ @type forward
+ @id out_perf_fwd
+ @log_level debug
+ send_timeout 30
+ connect_timeout 30
+ heartbeat_type none
+
+ host 0.0.0.0
+ port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}"
+
+
+ @type file
+ overflow_action drop_oldest_chunk
+ chunk_limit_size 4m
+ queue_limit_length "#{ENV['FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
+ retry_max_times 10
+ retry_wait 5s
+ retry_max_interval 5m
+ flush_thread_count 5
+
+ keepalive true
+
-
- @type inventory2mdm
- @log_level info
-
-
- #custom_metrics_mdm filter plugin for perf data from windows nodes
+ #custom_metrics_mdm filter plugin for perf data from windows nodes
@type cadvisor2mdm
metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes
@log_level info
- #kubepodinventory
-
- @type forward
- @log_level debug
- send_timeout 30
- connect_timeout 30
- heartbeat_type none
-
- host 0.0.0.0
- port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}"
-
+ #containerinventory for windows containers
+
+ @type forward
+ @id out_ci_fwd
+ @log_level debug
+ send_timeout 30
+ connect_timeout 30
+ heartbeat_type none
+
+ host 0.0.0.0
+ port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}"
+
@type file
- path /var/opt/microsoft/docker-cimprov/state/kubepod*.buffer
overflow_action drop_oldest_chunk
chunk_limit_size 4m
- queue_limit_length 20
- flush_interval 20s
+ queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
retry_max_times 10
retry_wait 5s
retry_max_interval 5m
- flush_thread_count 5
+ flush_thread_count 5
- keepalive true
+ keepalive true
- #kubepvinventory
-
+
+
+ #Kubernetes pod inventory
+
+ @type kube_podinventory
+ tag oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB
+ run_interval 60
+ @log_level debug
+
+
+ #kubepodinventory
+
@type forward
@log_level debug
send_timeout 30
@@ -105,22 +86,21 @@
@type file
- path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer
+ path /var/opt/microsoft/docker-cimprov/state/kubepod*.buffer
overflow_action drop_oldest_chunk
chunk_limit_size 4m
- queue_limit_length 20
- flush_interval 20s
+ queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
retry_max_times 10
retry_wait 5s
retry_max_interval 5m
- flush_thread_count 5
+ flush_thread_count 5
- keepalive true
-
+ keepalive true
+
- #InsightsMetrics
- #kubestate
-
+ #kubeservices
+
@type forward
@log_level debug
send_timeout 30
@@ -132,21 +112,30 @@
@type file
- path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer
+ path /var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer
overflow_action drop_oldest_chunk
chunk_limit_size 4m
- queue_limit_length 20
- flush_interval 20s
+ queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
retry_max_times 10
retry_wait 5s
retry_max_interval 5m
- flush_thread_count 5
+ flush_thread_count 2
- keepalive true
+ keepalive true
+
+
+ #Kubernetes Nodes
+
+ @type kube_nodes
+ tag oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB
+ run_interval 60
+ @log_level debug
+
- #kubeevents
-
+ #containernodeinventory
+
@type forward
@log_level debug
send_timeout 30
@@ -158,21 +147,26 @@
@type file
- path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer
+ path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer
overflow_action drop_oldest_chunk
chunk_limit_size 4m
- queue_limit_length 20
- flush_interval 20s
+ queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
retry_max_times 10
retry_wait 5s
retry_max_interval 5m
- flush_thread_count 5
+ flush_thread_count 3
keepalive true
-
- #kubeservices
-
+
+
+ @type inventory2mdm
+ @log_level info
+
+
+ #kubenodeinventory
+
@type forward
@log_level debug
send_timeout 30
@@ -184,47 +178,49 @@
@type file
- path /var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer
+ path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer
overflow_action drop_oldest_chunk
chunk_limit_size 4m
- queue_limit_length 20
- flush_interval 20s
+ queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
retry_max_times 10
retry_wait 5s
retry_max_interval 5m
- flush_thread_count 2
+ flush_thread_count 5
- keepalive true
-
+ keepalive true
+
- #kubenodeinventory
-
- @type forward
+
+ @type mdm
+ @id out_mdm_nodeinventory
@log_level debug
- send_timeout 30
- connect_timeout 30
- heartbeat_type none
-
- host 0.0.0.0
- port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}"
-
@type file
- path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer
+ path /var/opt/microsoft/docker-cimprov/state/out_mdm_nodeinventory*.buffer
overflow_action drop_oldest_chunk
chunk_limit_size 4m
- queue_limit_length 20
- flush_interval 20s
+ queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
retry_max_times 10
retry_wait 5s
retry_max_interval 5m
- flush_thread_count 5
+ flush_thread_count 5
- keepalive true
+ retry_mdm_post_wait_minutes 30
+
+
+ #Kubernetes events
+
+ @type kube_events
+ tag oneagent.containerInsights.KUBE_EVENTS_BLOB
+ run_interval 60
+ @log_level debug
+
- #containernodeinventory
-
+ #kubeevents
+
@type forward
@log_level debug
send_timeout 30
@@ -236,47 +232,90 @@
@type file
- path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer
+ path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer
overflow_action drop_oldest_chunk
chunk_limit_size 4m
- queue_limit_length 20
- flush_interval 20s
+ queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
retry_max_times 10
retry_wait 5s
retry_max_interval 5m
- flush_thread_count 3
+ flush_thread_count 5
- keepalive true
+ keepalive true
+
+
+ #Kubernetes podmdm inventory
+
+ @type kube_podmdminventory
+ run_interval 60
+ @log_level debug
+
- #containerinventory for windows containers
-
- @type forward
- @log_level debug
- send_timeout 30
- connect_timeout 30
- heartbeat_type none
-
- host 0.0.0.0
- port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}"
-
+
+ @type mdm
+ @id out_mdm_podinventory
+ @log_level debug
@type file
- path /var/opt/microsoft/docker-cimprov/state/containerinventory*.buffer
+ path /var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer
overflow_action drop_oldest_chunk
chunk_limit_size 4m
- queue_limit_length 20
- flush_interval 20s
+ queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
retry_max_times 10
retry_wait 5s
retry_max_interval 5m
- flush_thread_count 5
+ flush_thread_count "#{ENV['FLUENTD_MDM_FLUSH_THREAD_COUNT']}"
- keepalive true
-
+ retry_mdm_post_wait_minutes 30
+
+
+
+
+ #Kubernetes perf inventory
+
+ @type kube_perfinventory
+ tag oneagent.containerInsights.LINUX_PERF_BLOB
+ run_interval 60
+ @log_level debug
+
+
+ #Kubernetes Persistent Volume inventory
+
+ @type kube_pvinventory
+ tag oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB
+ run_interval 60
+ @log_level debug
+
- #perf
-
+ #cadvisor perf- Windows nodes
+
+ @type win_cadvisor_perf
+ tag oneagent.containerInsights.LINUX_PERF_BLOB
+ run_interval 60
+ @log_level debug
+
+
+ #Kubernetes object state - deployments
+
+ @type kubestate_deployments
+ tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB
+ run_interval 60
+ @log_level debug
+
+
+ #Kubernetes object state - HPA
+
+ @type kubestate_hpa
+ tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB
+ run_interval 60
+ @log_level debug
+
+
+ #kubepvinventory
+
@type forward
@log_level debug
send_timeout 30
@@ -288,51 +327,62 @@
@type file
- path /var/opt/microsoft/docker-cimprov/state/perf*.buffer
+ path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer
overflow_action drop_oldest_chunk
chunk_limit_size 4m
- queue_limit_length 20
- flush_interval 20s
+ queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
retry_max_times 10
retry_wait 5s
retry_max_interval 5m
- flush_thread_count 5
+ flush_thread_count 5
- keepalive true
+ keepalive true
-
- @type mdm
- @log_level debug
+ #InsightsMetrics
+ #kubestate
+
+ @type forward
+ @log_level debug
+ send_timeout 30
+ connect_timeout 30
+ heartbeat_type none
+
+ host 0.0.0.0
+ port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}"
+
@type file
- path /var/opt/microsoft/docker-cimprov/state/out_mdm_*.buffer
+ path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer
overflow_action drop_oldest_chunk
chunk_limit_size 4m
- queue_limit_length 20
- flush_interval 20s
+ queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
retry_max_times 10
retry_wait 5s
retry_max_interval 5m
- flush_thread_count 5
+ flush_thread_count 5
- retry_mdm_post_wait_minutes 30
+ keepalive true
@type mdm
+ @id out_mdm_perf
@log_level debug
@type file
path /var/opt/microsoft/docker-cimprov/state/out_mdm_cdvisorperf*.buffer
overflow_action drop_oldest_chunk
chunk_limit_size 4m
- queue_limit_length 20
- flush_interval 20s
+ queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}"
+ flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}"
retry_max_times 10
retry_wait 5s
retry_max_interval 5m
- flush_thread_count 5
+ flush_thread_count 5
retry_mdm_post_wait_minutes 30
+
diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data
index 7dcbde31f..92b494ae3 100644
--- a/build/linux/installer/datafiles/base_container.data
+++ b/build/linux/installer/datafiles/base_container.data
@@ -132,6 +132,8 @@ MAINTAINER: 'Microsoft Corporation'
/etc/fluent/plugin/in_containerinventory.rb; source/plugins/ruby/in_containerinventory.rb; 644; root; root
/etc/fluent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root
/etc/fluent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root
+/etc/fluent/plugin/in_kube_podmdminventory.rb; source/plugins/ruby/in_kube_podmdminventory.rb; 644; root; root
+/etc/fluent/plugin/in_kube_perfinventory.rb; source/plugins/ruby/in_kube_perfinventory.rb; 644; root; root
/etc/fluent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root
/etc/fluent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root
/etc/fluent/plugin/in_kube_pvinventory.rb; source/plugins/ruby/in_kube_pvinventory.rb; 644; root; root
@@ -143,6 +145,7 @@ MAINTAINER: 'Microsoft Corporation'
/etc/fluent/plugin/filter_telegraf2mdm.rb; source/plugins/ruby/filter_telegraf2mdm.rb; 644; root; root
/etc/fluent/plugin/out_mdm.rb; source/plugins/ruby/out_mdm.rb; 644; root; root
+/etc/fluent/plugin/WatchStream.rb; source/plugins/ruby/WatchStream.rb; 644; root; root
diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
index b5b239af0..ad7452aa5 100644
--- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
@@ -53,6 +53,11 @@ spec:
resources:
{{ toYaml .Values.omsagent.resources.deployment | indent 9 }}
env:
+ - name: NUM_OF_FLUENTD_WORKERS
+ valueFrom:
+ resourceFieldRef:
+ containerName: omsagent
+ resource: limits.cpu
{{- if ne .Values.omsagent.env.clusterId "" }}
- name: AKS_RESOURCE_ID
value: {{ .Values.omsagent.env.clusterId | quote }}
diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh
index 3e25fc3a4..1e00457d9 100644
--- a/kubernetes/linux/main.sh
+++ b/kubernetes/linux/main.sh
@@ -23,8 +23,7 @@ waitforlisteneronTCPport() {
if [[ $port =~ $numeric ]] && [[ $waittimesecs =~ $numeric ]]; then
#local varlistener=$(netstat -lnt | awk '$6 == "LISTEN" && $4 ~ ":25228$"')
- while true
- do
+ while true; do
if [ $totalsleptsecs -gt $waittimesecs ]; then
echo "${FUNCNAME[0]} giving up waiting for listener on port:$port after $totalsleptsecs secs"
return 1
@@ -33,7 +32,7 @@ waitforlisteneronTCPport() {
if [ -z "$varlistener" ]; then
#echo "${FUNCNAME[0]} waiting for $sleepdurationsecs more sec for listener on port:$port ..."
sleep $sleepdurationsecs
- totalsleptsecs=$(($totalsleptsecs+1))
+ totalsleptsecs=$(($totalsleptsecs + 1))
else
echo "${FUNCNAME[0]} found listener on port:$port in $totalsleptsecs secs"
return 0
@@ -65,23 +64,22 @@ checkAgentOnboardingStatus() {
successMessage="Loaded data sources"
failureMessage="Failed to load data sources into config"
fi
- while true
- do
- if [ $totalsleptsecs -gt $waittimesecs ]; then
- echo "${FUNCNAME[0]} giving up checking agent onboarding status after $totalsleptsecs secs"
- return 1
- fi
-
- if grep "$successMessage" "${MDSD_LOG}/mdsd.info"; then
- echo "Onboarding success"
- return 0
- elif grep "$failureMessage" "${MDSD_LOG}/mdsd.err"; then
- echo "Onboarding Failure: Reason: Failed to onboard the agent"
- echo "Onboarding Failure: Please verify log analytics workspace configuration such as existence of the workspace, workspace key and workspace enabled for public ingestion"
- return 1
- fi
- sleep $sleepdurationsecs
- totalsleptsecs=$(($totalsleptsecs+1))
+ while true; do
+ if [ $totalsleptsecs -gt $waittimesecs ]; then
+ echo "${FUNCNAME[0]} giving up checking agent onboarding status after $totalsleptsecs secs"
+ return 1
+ fi
+
+ if grep "$successMessage" "${MDSD_LOG}/mdsd.info"; then
+ echo "Onboarding success"
+ return 0
+ elif grep "$failureMessage" "${MDSD_LOG}/mdsd.err"; then
+ echo "Onboarding Failure: Reason: Failed to onboard the agent"
+ echo "Onboarding Failure: Please verify log analytics workspace configuration such as existence of the workspace, workspace key and workspace enabled for public ingestion"
+ return 1
+ fi
+ sleep $sleepdurationsecs
+ totalsleptsecs=$(($totalsleptsecs + 1))
done
else
echo "${FUNCNAME[0]} called with non-numeric arguments<$2>. Required arguments <#wait-time-in-seconds>"
@@ -90,6 +88,103 @@ checkAgentOnboardingStatus() {
fi
}
+setReplicaSetSpecificConfig() {
+ echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}"
+ export FLUENTD_FLUSH_INTERVAL="20s"
+ export FLUENTD_QUEUE_LIMIT_LENGTH="20" # default
+ export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="20"
+ export FLUENTD_MDM_FLUSH_THREAD_COUNT="5" # default
+ case $NUM_OF_FLUENTD_WORKERS in
+ [5-9]|9[0-9]|100)
+ export NUM_OF_FLUENTD_WORKERS=5 # Max is 5 core even if the specified limits more than 5 cores
+ export FLUENTD_POD_INVENTORY_WORKER_ID=4
+ export FLUENTD_NODE_INVENTORY_WORKER_ID=3
+ export FLUENTD_EVENT_INVENTORY_WORKER_ID=2
+ export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1
+ export FLUENTD_OTHER_INVENTORY_WORKER_ID=0
+ export FLUENTD_FLUSH_INTERVAL="5s"
+ export FLUENTD_QUEUE_LIMIT_LENGTH="50"
+ export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="100" # kube perf is high volume so would need large queue limit to avoid data loss
+ export MONITORING_MAX_EVENT_RATE="100000" # default MDSD EPS is 20K which is not enough for large scale
+ export FLUENTD_MDM_FLUSH_THREAD_COUNT="20" # if the pod mdm inventory running on separate worker
+ ;;
+ 4)
+ export NUM_OF_FLUENTD_WORKERS=4
+ export FLUENTD_POD_INVENTORY_WORKER_ID=3
+ export FLUENTD_NODE_INVENTORY_WORKER_ID=2
+ export FLUENTD_EVENT_INVENTORY_WORKER_ID=1
+ export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0
+ export FLUENTD_OTHER_INVENTORY_WORKER_ID=0
+ export FLUENTD_FLUSH_INTERVAL="10s"
+ export FLUENTD_QUEUE_LIMIT_LENGTH="40"
+ export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="80" # kube perf is high volume so would need large queue limit
+ export MONITORING_MAX_EVENT_RATE="80000" # default MDSD EPS is 20K which is not enough for large scale
+ ;;
+ 3)
+ export NUM_OF_FLUENTD_WORKERS=3
+ export FLUENTD_POD_INVENTORY_WORKER_ID=2
+ export FLUENTD_NODE_INVENTORY_WORKER_ID=1
+ export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0
+ export FLUENTD_EVENT_INVENTORY_WORKER_ID=0
+ export FLUENTD_OTHER_INVENTORY_WORKER_ID=0
+ export FLUENTD_FLUSH_INTERVAL="15s"
+ export FLUENTD_QUEUE_LIMIT_LENGTH="30"
+ export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="60" # kube perf is high volume so would need large queue limit
+ export MONITORING_MAX_EVENT_RATE="60000" # default MDSD EPS is 20K which is not enough for large scale
+ ;;
+ 2)
+ export NUM_OF_FLUENTD_WORKERS=2
+ export FLUENTD_POD_INVENTORY_WORKER_ID=1
+ export FLUENTD_NODE_INVENTORY_WORKER_ID=1
+ export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0
+ export FLUENTD_EVENT_INVENTORY_WORKER_ID=0
+ export FLUENTD_OTHER_INVENTORY_WORKER_ID=0
+ export FLUENTD_FLUSH_INTERVAL="20s"
+ export FLUENTD_QUEUE_LIMIT_LENGTH="20"
+ export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="40" # kube perf is high volume so would need large queue limit
+ export MONITORING_MAX_EVENT_RATE="40000" # default MDSD EPS is 20K which is not enough for large scale
+ ;;
+
+ *)
+ export NUM_OF_FLUENTD_WORKERS=1
+ export FLUENTD_POD_INVENTORY_WORKER_ID=0
+ export FLUENTD_NODE_INVENTORY_WORKER_ID=0
+ export FLUENTD_EVENT_INVENTORY_WORKER_ID=0
+ export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0
+ export FLUENTD_OTHER_INVENTORY_WORKER_ID=0
+ export FLUENTD_FLUSH_INTERVAL="20s"
+ export FLUENTD_QUEUE_LIMIT_LENGTH="20"
+ export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="20"
+ ;;
+ esac
+ echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc
+ echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc
+ echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc
+ echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc
+ echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc
+ echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc
+ echo "export FLUENTD_FLUSH_INTERVAL=$FLUENTD_FLUSH_INTERVAL" >>~/.bashrc
+ echo "export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH=$FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH" >>~/.bashrc
+ echo "export FLUENTD_QUEUE_LIMIT_LENGTH=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc
+ echo "export FLUENTD_MDM_FLUSH_THREAD_COUNT=$FLUENTD_MDM_FLUSH_THREAD_COUNT" >>~/.bashrc
+
+ if [ ! -z $MONITORING_MAX_EVENT_RATE ]; then
+ echo "export MONITORING_MAX_EVENT_RATE=$MONITORING_MAX_EVENT_RATE" >>~/.bashrc
+ echo "Configured MDSD Max EPS is: ${MONITORING_MAX_EVENT_RATE}"
+ fi
+
+ source ~/.bashrc
+
+ echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}"
+ echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}"
+ echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}"
+ echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}"
+ echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}"
+ echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}"
+ echo "fluentd kube perf buffer plugin queue length: ${FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH}"
+ echo "fluentd buffer plugin queue length for all other non kube perf plugin: ${FLUENTD_QUEUE_LIMIT_LENGTH}"
+ echo "fluentd out mdm flush thread count: ${FLUENTD_MDM_FLUSH_THREAD_COUNT}"
+}
#using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding
mkdir -p /var/opt/microsoft/docker-cimprov/state
@@ -98,8 +193,8 @@ mkdir -p /var/opt/microsoft/docker-cimprov/state
inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s'
#Run inotify as a daemon to track changes to the mounted configmap for OSM settings.
-if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) ||
- ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then
+if [[ ((! -e "/etc/config/kube.conf") && ("${CONTAINER_TYPE}" == "PrometheusSidecar")) ||
+ ((-e "/etc/config/kube.conf") && ("${SIDECAR_SCRAPING_ENABLED}" == "false")) ]]; then
inotifywait /etc/config/osm-settings --daemon --recursive --outfile "/opt/inotifyoutput-osm.txt" --event create,delete --format '%e : %T' --timefmt '+%s'
fi
@@ -108,58 +203,58 @@ if [ -z $AKS_RESOURCE_ID ]; then
echo "not setting customResourceId"
else
export customResourceId=$AKS_RESOURCE_ID
- echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc
+ echo "export customResourceId=$AKS_RESOURCE_ID" >>~/.bashrc
source ~/.bashrc
echo "customResourceId:$customResourceId"
export customRegion=$AKS_REGION
- echo "export customRegion=$AKS_REGION" >> ~/.bashrc
+ echo "export customRegion=$AKS_REGION" >>~/.bashrc
source ~/.bashrc
echo "customRegion:$customRegion"
fi
#set agent config schema version
-if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/schema-version" ]; then
+if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/schema-version" ]; then
#trim
config_schema_version="$(cat /etc/config/settings/schema-version | xargs)"
#remove all spaces
config_schema_version="${config_schema_version//[[:space:]]/}"
#take first 10 characters
- config_schema_version="$(echo $config_schema_version| cut -c1-10)"
+ config_schema_version="$(echo $config_schema_version | cut -c1-10)"
export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version
- echo "export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version" >> ~/.bashrc
+ echo "export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version" >>~/.bashrc
source ~/.bashrc
echo "AZMON_AGENT_CFG_SCHEMA_VERSION:$AZMON_AGENT_CFG_SCHEMA_VERSION"
fi
#set agent config file version
-if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/config-version" ]; then
+if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/config-version" ]; then
#trim
config_file_version="$(cat /etc/config/settings/config-version | xargs)"
#remove all spaces
config_file_version="${config_file_version//[[:space:]]/}"
#take first 10 characters
- config_file_version="$(echo $config_file_version| cut -c1-10)"
+ config_file_version="$(echo $config_file_version | cut -c1-10)"
export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version
- echo "export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version" >> ~/.bashrc
+ echo "export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version" >>~/.bashrc
source ~/.bashrc
echo "AZMON_AGENT_CFG_FILE_VERSION:$AZMON_AGENT_CFG_FILE_VERSION"
fi
#set OSM config schema version
-if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) ||
- ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then
- if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then
+if [[ ((! -e "/etc/config/kube.conf") && ("${CONTAINER_TYPE}" == "PrometheusSidecar")) ||
+ ((-e "/etc/config/kube.conf") && ("${SIDECAR_SCRAPING_ENABLED}" == "false")) ]]; then
+ if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then
#trim
osm_config_schema_version="$(cat /etc/config/osm-settings/schema-version | xargs)"
#remove all spaces
osm_config_schema_version="${osm_config_schema_version//[[:space:]]/}"
#take first 10 characters
- osm_config_schema_version="$(echo $osm_config_schema_version| cut -c1-10)"
+ osm_config_schema_version="$(echo $osm_config_schema_version | cut -c1-10)"
export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version
- echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >> ~/.bashrc
+ echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >>~/.bashrc
source ~/.bashrc
echo "AZMON_OSM_CFG_SCHEMA_VERSION:$AZMON_OSM_CFG_SCHEMA_VERSION"
fi
@@ -201,13 +296,13 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then
if [ -z "$host" -o -z "$port" ]; then
echo "-e error proxy endpoint should be in this format http(s)://: or http(s)://:@:"
else
- echo "successfully validated provided proxy endpoint is valid and expected format"
+ echo "successfully validated provided proxy endpoint is valid and expected format"
fi
- echo $pwd > /opt/microsoft/docker-cimprov/proxy_password
+ echo $pwd >/opt/microsoft/docker-cimprov/proxy_password
export MDSD_PROXY_MODE=application
- echo "export MDSD_PROXY_MODE=$MDSD_PROXY_MODE" >> ~/.bashrc
+ echo "export MDSD_PROXY_MODE=$MDSD_PROXY_MODE" >>~/.bashrc
export MDSD_PROXY_ADDRESS=$proto$hostport
echo "export MDSD_PROXY_ADDRESS=$MDSD_PROXY_ADDRESS" >> ~/.bashrc
if [ ! -z "$user" -a ! -z "$pwd" ]; then
@@ -231,8 +326,8 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then
curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT
fi
else
- echo "Making curl request to oms endpint with domain: $domain"
- curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest
+ echo "Making curl request to oms endpoint with domain: $domain"
+ curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest
fi
if [ $? -ne 0 ]; then
@@ -245,8 +340,8 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then
RET=`curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co --proxy $PROXY_ENDPOINT`
fi
else
- echo "Making curl request to ifconfig.co"
- RET=`curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co`
+ echo "Making curl request to ifconfig.co"
+ RET=$(curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co)
fi
if [ $RET -eq 000 ]; then
echo "-e error Error resolving host during the onboarding request. Check the internet connectivity and/or network policy on the cluster"
@@ -261,8 +356,8 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then
curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT
fi
else
- echo "ifconfig check succeeded, retrying oms endpoint..."
- curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest
+ echo "ifconfig check succeeded, retrying oms endpoint..."
+ curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest
fi
if [ $? -ne 0 ]; then
@@ -278,23 +373,22 @@ else
echo "LA Onboarding:Workspace Id not mounted, skipping the telemetry check"
fi
-
# Set environment variable for if public cloud by checking the workspace domain.
if [ -z $domain ]; then
- ClOUD_ENVIRONMENT="unknown"
+ CLOUD_ENVIRONMENT="unknown"
elif [ $domain == "opinsights.azure.com" ]; then
- CLOUD_ENVIRONMENT="azurepubliccloud"
+ CLOUD_ENVIRONMENT="azurepubliccloud"
elif [ $domain == "opinsights.azure.cn" ]; then
- CLOUD_ENVIRONMENT="azurechinacloud"
+ CLOUD_ENVIRONMENT="azurechinacloud"
elif [ $domain == "opinsights.azure.us" ]; then
- CLOUD_ENVIRONMENT="azureusgovernmentcloud"
+ CLOUD_ENVIRONMENT="azureusgovernmentcloud"
elif [ $domain == "opinsights.azure.eaglex.ic.gov" ]; then
- CLOUD_ENVIRONMENT="usnat"
+ CLOUD_ENVIRONMENT="usnat"
elif [ $domain == "opinsights.azure.microsoft.scloud" ]; then
- CLOUD_ENVIRONMENT="ussec"
+ CLOUD_ENVIRONMENT="ussec"
fi
export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT
-echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc
+echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >>~/.bashrc
# Copying over CA certs for airgapped clouds. This is needed for Mariner vs Ubuntu hosts.
# We are unable to tell if the host is Mariner or Ubuntu,
@@ -302,7 +396,7 @@ echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc
# One will have the certs and the other will be empty.
# These need to be copied to a different location for Mariner vs Ubuntu containers.
# OS_ID here is the container distro.
-# Adding Mariner now even though the elif will never currently evaluate.
+# Adding Mariner now even though the elif will never currently evaluate.
if [ $CLOUD_ENVIRONMENT == "usnat" ] || [ $CLOUD_ENVIRONMENT == "ussec" ]; then
OS_ID=$(cat /etc/os-release | grep ^ID= | cut -d '=' -f2 | tr -d '"' | tr -d "'")
if [ $OS_ID == "mariner" ]; then
@@ -322,39 +416,38 @@ fi
#consisten naming conventions with the windows
export DOMAIN=$domain
-echo "export DOMAIN=$DOMAIN" >> ~/.bashrc
+echo "export DOMAIN=$DOMAIN" >>~/.bashrc
export WSID=$workspaceId
-echo "export WSID=$WSID" >> ~/.bashrc
+echo "export WSID=$WSID" >>~/.bashrc
# Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds)
-if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1)
+if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1)
for BACKOFF in {1..4}; do
- KEY=$(curl -sS $APPLICATIONINSIGHTS_AUTH_URL )
+ KEY=$(curl -sS $APPLICATIONINSIGHTS_AUTH_URL)
# there's no easy way to get the HTTP status code from curl, so just check if the result is well formatted
if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then
break
else
- sleep $((2**$BACKOFF / 4)) # (exponential backoff)
+ sleep $((2 ** $BACKOFF / 4)) # (exponential backoff)
fi
done
# validate that the retrieved data is an instrumentation key
if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then
export APPLICATIONINSIGHTS_AUTH=$(echo $KEY)
- echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >> ~/.bashrc
+ echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >>~/.bashrc
echo "Using cloud-specific instrumentation key"
else
# no ikey can be retrieved. Disable telemetry and continue
export DISABLE_TELEMETRY=true
- echo "export DISABLE_TELEMETRY=true" >> ~/.bashrc
+ echo "export DISABLE_TELEMETRY=true" >>~/.bashrc
echo "Could not get cloud-specific instrumentation key (network error?). Disabling telemetry"
fi
fi
-
aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode)
export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey
-echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc
+echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >>~/.bashrc
source ~/.bashrc
@@ -363,7 +456,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then
/usr/bin/ruby2.7 tomlparser.rb
cat config_env_var | while read line; do
- echo $line >> ~/.bashrc
+ echo $line >>~/.bashrc
done
source config_env_var
fi
@@ -399,18 +492,18 @@ fi
if [ ! -e "/etc/config/kube.conf" ]; then
if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then
cat defaultpromenvvariables-sidecar | while read line; do
- echo $line >> ~/.bashrc
+ echo $line >>~/.bashrc
done
source defaultpromenvvariables-sidecar
else
cat defaultpromenvvariables | while read line; do
- echo $line >> ~/.bashrc
+ echo $line >>~/.bashrc
done
source defaultpromenvvariables
fi
else
cat defaultpromenvvariables-rs | while read line; do
- echo $line >> ~/.bashrc
+ echo $line >>~/.bashrc
done
source defaultpromenvvariables-rs
fi
@@ -418,7 +511,7 @@ fi
#Sourcing environment variable file if it exists. This file has telemetry and whether kubernetes pods are monitored
if [ -e "telemetry_prom_config_env_var" ]; then
cat telemetry_prom_config_env_var | while read line; do
- echo $line >> ~/.bashrc
+ echo $line >>~/.bashrc
done
source telemetry_prom_config_env_var
fi
@@ -431,20 +524,19 @@ if [ ! -e "/etc/config/kube.conf" ]; then
#Sourcing config environment variable file if it exists
if [ -e "side_car_fbit_config_env_var" ]; then
cat side_car_fbit_config_env_var | while read line; do
- echo $line >> ~/.bashrc
+ echo $line >>~/.bashrc
done
source side_car_fbit_config_env_var
fi
fi
fi
-
#Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting.
if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then
/usr/bin/ruby2.7 tomlparser-mdm-metrics-config.rb
cat config_mdm_metrics_env_var | while read line; do
- echo $line >> ~/.bashrc
+ echo $line >>~/.bashrc
done
source config_mdm_metrics_env_var
@@ -452,7 +544,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then
/usr/bin/ruby2.7 tomlparser-metric-collection-config.rb
cat config_metric_collection_env_var | while read line; do
- echo $line >> ~/.bashrc
+ echo $line >>~/.bashrc
done
source config_metric_collection_env_var
fi
@@ -464,15 +556,15 @@ if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "Prometheus
if [ -e "integration_osm_config_env_var" ]; then
cat integration_osm_config_env_var | while read line; do
- echo $line >> ~/.bashrc
+ echo $line >>~/.bashrc
done
source integration_osm_config_env_var
fi
fi
-# If the prometheus sidecar isn't doing anything then there's no need to run mdsd and telegraf in it.
-if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) &&
- ( "${TELEMETRY_CUSTOM_PROM_MONITOR_PODS}" == "false" ) &&
+# If the prometheus sidecar isn't doing anything then there's no need to run mdsd and telegraf in it.
+if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) &&
+ ( "${TELEMETRY_CUSTOM_PROM_MONITOR_PODS}" == "false" ) &&
( "${TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT}" -eq 0 ) ]]; then
setGlobalEnvVar MUTE_PROM_SIDECAR true
else
@@ -498,21 +590,20 @@ fi
export CONTAINER_RUNTIME="containerd"
export NODE_NAME=""
-
if [ "$cAdvisorIsSecure" = true ]; then
echo "Using port 10250"
export IS_SECURE_CADVISOR_PORT=true
- echo "export IS_SECURE_CADVISOR_PORT=true" >> ~/.bashrc
+ echo "export IS_SECURE_CADVISOR_PORT=true" >>~/.bashrc
export CADVISOR_METRICS_URL="https://$NODE_IP:10250/metrics"
- echo "export CADVISOR_METRICS_URL=https://$NODE_IP:10250/metrics" >> ~/.bashrc
+ echo "export CADVISOR_METRICS_URL=https://$NODE_IP:10250/metrics" >>~/.bashrc
echo "Making curl request to cadvisor endpoint /pods with port 10250 to get the configured container runtime on kubelet"
podWithValidContainerId=$(curl -s -k -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" https://$NODE_IP:10250/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]')
else
echo "Using port 10255"
export IS_SECURE_CADVISOR_PORT=false
- echo "export IS_SECURE_CADVISOR_PORT=false" >> ~/.bashrc
+ echo "export IS_SECURE_CADVISOR_PORT=false" >>~/.bashrc
export CADVISOR_METRICS_URL="http://$NODE_IP:10255/metrics"
- echo "export CADVISOR_METRICS_URL=http://$NODE_IP:10255/metrics" >> ~/.bashrc
+ echo "export CADVISOR_METRICS_URL=http://$NODE_IP:10255/metrics" >>~/.bashrc
echo "Making curl request to cadvisor endpoint with port 10255 to get the configured container runtime on kubelet"
podWithValidContainerId=$(curl -s http://$NODE_IP:10255/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]')
fi
@@ -524,13 +615,13 @@ if [ ! -z "$podWithValidContainerId" ]; then
containerRuntime=$(echo $containerRuntime | tr "[:upper:]" "[:lower:]")
nodeName=$(echo $nodeName | tr "[:upper:]" "[:lower:]")
# use default container runtime if obtained runtime value is either empty or null
- if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then
+ if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then
echo "using default container runtime as $CONTAINER_RUNTIME since got containeRuntime as empty or null"
else
export CONTAINER_RUNTIME=$containerRuntime
fi
- if [ -z "$nodeName" -o "$nodeName" == null ]; then
+ if [ -z "$nodeName" -o "$nodeName" == null ]; then
echo "-e error nodeName in /pods API response is empty"
else
export NODE_NAME=$nodeName
@@ -540,21 +631,21 @@ else
fi
echo "configured container runtime on kubelet is : "$CONTAINER_RUNTIME
-echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >> ~/.bashrc
+echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >>~/.bashrc
export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="kubelet_runtime_operations_total"
-echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >> ~/.bashrc
+echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >>~/.bashrc
export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="kubelet_runtime_operations_errors_total"
-echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >> ~/.bashrc
+echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >>~/.bashrc
# default to docker metrics
export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_docker_operations"
export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_docker_operations_errors"
if [ "$CONTAINER_RUNTIME" != "docker" ]; then
- # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18
- export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations"
- export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors"
+ # these metrics are available only on k8s versions <1.18 and will get deprecated from 1.18
+ export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations"
+ export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors"
fi
echo "set caps for ruby process to read container env from proc"
@@ -564,7 +655,7 @@ echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="$KUBELET_RUNTIME_OPERATIO
source ~/.bashrc
-echo $NODE_NAME > /var/opt/microsoft/docker-cimprov/state/containerhostname
+echo $NODE_NAME >/var/opt/microsoft/docker-cimprov/state/containerhostname
#check if file was written successfully.
cat /var/opt/microsoft/docker-cimprov/state/containerhostname
@@ -577,16 +668,20 @@ dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}'
DOCKER_CIMPROV_VERSION=$(dpkg -l | grep docker-cimprov | awk '{print $3}')
echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION"
export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION
-echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc
+echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >>~/.bashrc
+if [ "${CONTROLLER_TYPE}" == "ReplicaSet" ]; then
+ echo "*** set applicable replicaset config ***"
+ setReplicaSetSpecificConfig
+fi
#skip imds lookup since not used either legacy or aad msi auth path
export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true"
-echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc
+echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >>~/.bashrc
# this used by mdsd to determine cloud specific LA endpoints
export OMS_TLD=$domain
-echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc
+echo "export OMS_TLD=$OMS_TLD" >>~/.bashrc
cat /etc/mdsd.d/envmdsd | while read line; do
- echo $line >> ~/.bashrc
+ echo $line >>~/.bashrc
done
source /etc/mdsd.d/envmdsd
MDSD_AAD_MSI_AUTH_ARGS=""
@@ -650,25 +745,25 @@ if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then
echo "not starting mdsd (no metrics to scrape since MUTE_PROM_SIDECAR is true)"
fi
else
- echo "starting mdsd mode in main container..."
- # add -T 0xFFFF for full traces
- mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>> /dev/null &
+ echo "starting mdsd in main container..."
+ # add -T 0xFFFF for full traces
+ mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>>/dev/null &
fi
# Set up a cron job for logrotation
if [ ! -f /etc/cron.d/ci-agent ]; then
- echo "setting up cronjob for ci agent log rotation"
- echo "*/5 * * * * root /usr/sbin/logrotate -s /var/lib/logrotate/ci-agent-status /etc/logrotate.d/ci-agent >/dev/null 2>&1" > /etc/cron.d/ci-agent
+ echo "setting up cronjob for ci agent log rotation"
+ echo "*/5 * * * * root /usr/sbin/logrotate -s /var/lib/logrotate/ci-agent-status /etc/logrotate.d/ci-agent >/dev/null 2>&1" >/etc/cron.d/ci-agent
fi
# no dependency on fluentd for prometheus side car container
if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then
if [ ! -e "/etc/config/kube.conf" ]; then
- echo "*** starting fluentd v1 in daemonset"
- fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 &
+ echo "*** starting fluentd v1 in daemonset"
+ fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 &
else
- echo "*** starting fluentd v1 in replicaset"
- fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 &
+ echo "*** starting fluentd v1 in replicaset"
+ fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 &
fi
fi
@@ -699,13 +794,13 @@ if [ ! -e "/etc/config/kube.conf" ]; then
fi
else
if [ -e "/opt/telegraf-test-rs.conf" ]; then
- echo "****************Start Telegraf in Test Mode**************************"
- /opt/telegraf --config /opt/telegraf-test-rs.conf --input-filter file -test
- if [ $? -eq 0 ]; then
- mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf"
- echo "Moving test conf file to telegraf replicaset conf since test run succeeded"
- fi
- echo "****************End Telegraf Run in Test Mode**************************"
+ echo "****************Start Telegraf in Test Mode**************************"
+ /opt/telegraf --config /opt/telegraf-test-rs.conf --input-filter file -test
+ if [ $? -eq 0 ]; then
+ mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf"
+ echo "Moving test conf file to telegraf replicaset conf since test run succeeded"
+ fi
+ echo "****************End Telegraf Run in Test Mode**************************"
fi
fi
@@ -753,15 +848,15 @@ else
fi
export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id
-echo "export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id" >> ~/.bashrc
+echo "export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id" >>~/.bashrc
export TELEMETRY_AKS_REGION=$telemetry_aks_region
-echo "export TELEMETRY_AKS_REGION=$telemetry_aks_region" >> ~/.bashrc
+echo "export TELEMETRY_AKS_REGION=$telemetry_aks_region" >>~/.bashrc
export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name
-echo "export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name" >> ~/.bashrc
+echo "export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name" >>~/.bashrc
export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name
-echo "export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name" >> ~/.bashrc
+echo "export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name" >>~/.bashrc
export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type
-echo "export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type" >> ~/.bashrc
+echo "export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type" >>~/.bashrc
#if [ ! -e "/etc/config/kube.conf" ]; then
# nodename=$(cat /hostfs/etc/hostname)
@@ -773,15 +868,15 @@ echo "replacing nodename in telegraf config"
sed -i -e "s/placeholder_hostname/$nodename/g" $telegrafConfFile
export HOST_MOUNT_PREFIX=/hostfs
-echo "export HOST_MOUNT_PREFIX=/hostfs" >> ~/.bashrc
+echo "export HOST_MOUNT_PREFIX=/hostfs" >>~/.bashrc
export HOST_PROC=/hostfs/proc
-echo "export HOST_PROC=/hostfs/proc" >> ~/.bashrc
+echo "export HOST_PROC=/hostfs/proc" >>~/.bashrc
export HOST_SYS=/hostfs/sys
-echo "export HOST_SYS=/hostfs/sys" >> ~/.bashrc
+echo "export HOST_SYS=/hostfs/sys" >>~/.bashrc
export HOST_ETC=/hostfs/etc
-echo "export HOST_ETC=/hostfs/etc" >> ~/.bashrc
+echo "export HOST_ETC=/hostfs/etc" >>~/.bashrc
export HOST_VAR=/hostfs/var
-echo "export HOST_VAR=/hostfs/var" >> ~/.bashrc
+echo "export HOST_VAR=/hostfs/var" >>~/.bashrc
if [ ! -e "/etc/config/kube.conf" ]; then
if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then
@@ -830,9 +925,10 @@ else
fi
shutdown() {
- pkill -f mdsd
- }
+ pkill -f mdsd
+}
trap "shutdown" SIGTERM
-sleep inf & wait
+sleep inf &
+wait
diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml
index 4e021e1b8..d2d7a0c87 100644
--- a/kubernetes/omsagent.yaml
+++ b/kubernetes/omsagent.yaml
@@ -403,6 +403,8 @@ spec:
# this used for e2e test and setting this just emits some additional log statements which used for the e2e tests
- name: ISTEST
value: "true"
+ - name: EMIT_CACHE_TELEMETRY
+ value: "false"
#Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters
#- name: ACS_RESOURCE_NAME
# value: "my_acs_cluster_name"
@@ -661,6 +663,13 @@ spec:
cpu: 150m
memory: 250Mi
env:
+ - name: NUM_OF_FLUENTD_WORKERS
+ valueFrom:
+ resourceFieldRef:
+ containerName: omsagent
+ resource: limits.cpu
+ - name: EMIT_CACHE_TELEMETRY
+ value: "false" # enable only debug or test purpose and disable for prod
- name: AKS_RESOURCE_ID
value: "VALUE_AKS_RESOURCE_ID_VALUE"
- name: AKS_REGION
diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb
index 8925248d7..ffd76bfbd 100644
--- a/source/plugins/ruby/KubernetesApiClient.rb
+++ b/source/plugins/ruby/KubernetesApiClient.rb
@@ -11,6 +11,8 @@ class KubernetesApiClient
require_relative "oms_common"
require_relative "constants"
+ require_relative "WatchStream"
+ require_relative "kubernetes_container_inventory"
@@ApiVersion = "v1"
@@ApiVersionApps = "v1"
@@ -35,8 +37,6 @@ class KubernetesApiClient
@Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M
@@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token"
@@TokenStr = nil
- @@NodeMetrics = Hash.new
- @@WinNodeArray = []
@@telemetryTimeTracker = DateTime.now.to_time.to_i
@@resourceLimitsTelemetryHash = {}
@@ -75,6 +75,39 @@ def getKubeResourceInfo(resource, api_group: nil)
return response
end
+ def getKubeResourceInfoV2(resource, api_group: nil)
+ headers = {}
+ response = nil
+ responseCode = nil
+ @Log.info "Getting Kube resource: #{resource}"
+ begin
+ resourceUri = getResourceUri(resource, api_group)
+ if !resourceUri.nil?
+ uri = URI.parse(resourceUri)
+ if !File.exist?(@@CaFile)
+ raise "#{@@CaFile} doesnt exist"
+ else
+ Net::HTTP.start(uri.host, uri.port, :use_ssl => true, :ca_file => @@CaFile, :verify_mode => OpenSSL::SSL::VERIFY_PEER, :open_timeout => 20, :read_timeout => 40) do |http|
+ kubeApiRequest = Net::HTTP::Get.new(uri.request_uri)
+ kubeApiRequest["Authorization"] = "Bearer " + getTokenStr
+ @Log.info "KubernetesAPIClient::getKubeResourceInfoV2 : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}"
+ response = http.request(kubeApiRequest)
+ responseCode = response.code
+ @Log.info "KubernetesAPIClient::getKubeResourceInfoV2 : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ rescue => error
+ @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}")
+ end
+ if (!response.nil?)
+ if (!response.body.nil? && response.body.empty?)
+ @Log.warn("KubernetesAPIClient::getKubeResourceInfoV2 : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}")
+ end
+ end
+ return responseCode, response
+ end
+
def getTokenStr
return @@TokenStr if !@@TokenStr.nil?
begin
@@ -88,7 +121,7 @@ def getTokenStr
end
end
- def getClusterRegion(env=ENV)
+ def getClusterRegion(env = ENV)
if env["AKS_REGION"]
return env["AKS_REGION"]
else
@@ -97,7 +130,7 @@ def getClusterRegion(env=ENV)
end
end
- def getResourceUri(resource, api_group, env=ENV)
+ def getResourceUri(resource, api_group, env = ENV)
begin
if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"]
if api_group.nil?
@@ -114,7 +147,7 @@ def getResourceUri(resource, api_group, env=ENV)
end
end
- def getClusterName(env=ENV)
+ def getClusterName(env = ENV)
return @@ClusterName if !@@ClusterName.nil?
@@ClusterName = "None"
begin
@@ -148,7 +181,7 @@ def getClusterName(env=ENV)
return @@ClusterName
end
- def getClusterId(env=ENV)
+ def getClusterId(env = ENV)
return @@ClusterId if !@@ClusterId.nil?
#By default initialize ClusterId to ClusterName.
# In ACS/On-prem, we need to figure out how we can generate ClusterId
@@ -292,8 +325,6 @@ def getWindowsNodes
resourceUri = getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows")
nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body)
@Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api"
- # Resetting the windows node cache
- @@WinNodeArray.clear
if (!nodeInventory.empty?)
nodeInventory["items"].each do |item|
# check for windows operating system in node metadata
@@ -303,11 +334,6 @@ def getWindowsNodes
if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil?
operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"]
if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0)
- # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data
- # to get images and image tags for containers in windows nodes
- if !nodeMetadata.nil? && !nodeMetadata["name"].nil?
- @@WinNodeArray.push(nodeMetadata["name"])
- end
nodeStatusAddresses = nodeStatus["addresses"]
if !nodeStatusAddresses.nil?
nodeStatusAddresses.each do |address|
@@ -327,7 +353,33 @@ def getWindowsNodes
end
def getWindowsNodesArray
- return @@WinNodeArray
+ winNodeArray = []
+ begin
+ # get only windows nodes
+ resourceUri = getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows")
+ nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body)
+ @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api"
+ if (!nodeInventory.empty?)
+ nodeInventory["items"].each do |item|
+ # check for windows operating system in node metadata
+ nodeStatus = item["status"]
+ nodeMetadata = item["metadata"]
+ if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil?
+ operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"]
+ if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0)
+ # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data
+ # to get images and image tags for containers in windows nodes
+ if !nodeMetadata.nil? && !nodeMetadata["name"].nil?
+ winNodeArray.push(nodeMetadata["name"])
+ end
+ end
+ end
+ end
+ end
+ rescue => error
+ @Log.warn("KubernetesApiClient::getWindowsNodesArray:failed with an error: #{error}")
+ end
+ return winNodeArray
end
def getContainerIDs(namespace)
@@ -409,7 +461,7 @@ def getPodUid(podNameSpace, podMetadata)
return podUid
end
- def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
+ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, nodeAllocatableRecord, metricTime = Time.now.utc.iso8601)
metricItems = []
begin
clusterId = getClusterId
@@ -456,19 +508,16 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle
metricCollection = {}
metricCollection["CounterName"] = metricNametoReturn
metricCollection["Value"] = metricValue
-
+
metricProps["json_Collections"] = []
- metricCollections = []
- metricCollections.push(metricCollection)
+ metricCollections = []
+ metricCollections.push(metricCollection)
metricProps["json_Collections"] = metricCollections.to_json
- metricItems.push(metricProps)
+ metricItems.push(metricProps)
#No container level limit for the given metric, so default to node level limit
else
- nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect
- if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey))
- metricValue = @@NodeMetrics[nodeMetricsHashKey]
- #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ")
-
+ if (metricCategory == "limits" && !nodeAllocatableRecord.nil? && !nodeAllocatableRecord.empty? && nodeAllocatableRecord.has_key?(metricNameToCollect))
+ metricValue = nodeAllocatableRecord[metricNameToCollect]
metricProps = {}
metricProps["Timestamp"] = metricTime
metricProps["Host"] = nodeName
@@ -481,10 +530,10 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle
metricCollection["CounterName"] = metricNametoReturn
metricCollection["Value"] = metricValue
metricProps["json_Collections"] = []
- metricCollections = []
- metricCollections.push(metricCollection)
+ metricCollections = []
+ metricCollections.push(metricCollection)
metricProps["json_Collections"] = metricCollections.to_json
- metricItems.push(metricProps)
+ metricItems.push(metricProps)
end
end
end
@@ -496,7 +545,7 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle
return metricItems
end #getContainerResourceRequestAndLimits
- def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
+ def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, nodeAllocatableRecord, metricTime = Time.now.utc.iso8601)
metricItems = []
begin
clusterId = getClusterId
@@ -541,8 +590,9 @@ def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory,
else
#No container level limit for the given metric, so default to node level limit for non-gpu metrics
if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu")
- nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect
- metricValue = @@NodeMetrics[nodeMetricsHashKey]
+ if !nodeAllocatableRecord.nil? && !nodeAllocatableRecord.empty? && nodeAllocatableRecord.has_key?(metricNameToCollect)
+ metricValue = nodeAllocatableRecord[metricNameToCollect]
+ end
end
end
if (!metricValue.nil?)
@@ -615,15 +665,10 @@ def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metri
metricCollection["CounterName"] = metricNametoReturn
metricCollection["Value"] = metricValue
metricCollections = []
- metricCollections.push(metricCollection)
-
+ metricCollections.push(metricCollection)
+
metricItem["json_Collections"] = []
metricItem["json_Collections"] = metricCollections.to_json
-
- #push node level metrics to a inmem hash so that we can use it looking up at container level.
- #Currently if container level cpu & memory limits are not defined we default to node level limits
- @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue
- #@Log.info ("Node metric hash: #{@@NodeMetrics}")
end
rescue => error
@Log.warn("parseNodeLimitsFromNodeItem failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
@@ -657,13 +702,6 @@ def parseNodeLimitsAsInsightsMetrics(node, metricCategory, metricNameToCollect,
metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect
metricItem["Tags"] = metricTags
-
- #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level.
- #Currently if container level cpu & memory limits are not defined we default to node level limits
- if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu")
- @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue
- #@Log.info ("Node metric hash: #{@@NodeMetrics}")
- end
end
rescue => error
@Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
@@ -754,6 +792,31 @@ def getMetricNumericValue(metricName, metricVal)
return metricValue
end # getMetricNumericValue
+ def getResourcesAndContinuationTokenV2(uri, api_group: nil)
+ continuationToken = nil
+ resourceInventory = nil
+ responseCode = nil
+ begin
+ @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2 : Getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}"
+ responseCode, resourceInfo = getKubeResourceInfoV2(uri, api_group: api_group)
+ @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2 : Done getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}"
+ if !responseCode.nil? && responseCode == "200" && !resourceInfo.nil?
+ @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2:Start:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}"
+ resourceInventory = Yajl::Parser.parse(StringIO.new(resourceInfo.body))
+ @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2:End:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}"
+ resourceInfo = nil
+ end
+ if (!resourceInventory.nil? && !resourceInventory["metadata"].nil?)
+ continuationToken = resourceInventory["metadata"]["continue"]
+ end
+ rescue => errorStr
+ @Log.warn "KubernetesApiClient::getResourcesAndContinuationTokenV2:Failed in get resources for #{uri} and continuation token: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ resourceInventory = nil
+ end
+ return continuationToken, resourceInventory, responseCode
+ end #getResourcesAndContinuationTokenV2
+
def getResourcesAndContinuationToken(uri, api_group: nil)
continuationToken = nil
resourceInventory = nil
@@ -778,7 +841,7 @@ def getResourcesAndContinuationToken(uri, api_group: nil)
return continuationToken, resourceInventory
end #getResourcesAndContinuationToken
- def getKubeAPIServerUrl(env=ENV)
+ def getKubeAPIServerUrl(env = ENV)
apiServerUrl = nil
begin
if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"]
@@ -818,5 +881,518 @@ def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601)
end
return kubeServiceRecords
end
+
+ # Accepts the following options:
+ # :namespace (string) - the namespace of the entity.
+ # :name (string) - the name of the entity to watch.
+ # :label_selector (string) - a selector to restrict the list of returned objects by labels.
+ # :field_selector (string) - a selector to restrict the list of returned objects by fields.
+ # :resource_version (string) - shows changes that occur after passed version of a resource.
+ # :allow_watch_bookmarks (bool) - flag to indicate whether to use bookmark or not.
+ def watch(resource_name, options = {})
+ begin
+ if !File.exist?(@@CaFile)
+ raise "#{@@CaFile} doesnt exist"
+ end
+ http_options = {
+ use_ssl: true,
+ open_timeout: 60,
+ read_timeout: 240, # https://github.com/kubernetes-client/java/issues/1370 https://github.com/kubernetes-client/java/issues/1578
+ ca_file: @@CaFile,
+ verify_mode: OpenSSL::SSL::VERIFY_PEER,
+ }
+ http_headers = {
+ Authorization: "Bearer " + getTokenStr,
+ }
+ ns = ""
+ if !options[:namespace].to_s.empty?
+ ns = "namespaces/#{options[:namespace]}/"
+ end
+ path = "watch/#{ns}#{resource_name}"
+ path += "/#{options[:name]}" if options[:name]
+ api_endpoint = "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + "#{path}"
+ uri = URI.parse(api_endpoint)
+ params = {}
+ WATCH_ARGUMENTS.each { |k, v| params[k] = options[v] if options[v] }
+ uri.query = URI.encode_www_form(params) if params.any?
+ watcher = WatchStream.new(
+ uri,
+ http_options,
+ http_headers,
+ @Log
+ )
+ return watcher unless block_given?
+ begin
+ watcher.each { |notice| yield(notice) }
+ ensure
+ watcher.finish if watcher
+ end
+ rescue => errorStr
+ @Log.warn "KubernetesApiClient::watch:Failed with an error: #{errorStr}"
+ end
+ end
+
+ def getOptimizedItem(resource, resourceItem, isWindowsItem = false)
+ case resource
+ when "pods"
+ return getPodOptimizedItem(resourceItem, isWindowsItem)
+ when "pods-perf"
+ return getPodPerfOptimizedItem(resourceItem)
+ when "nodes"
+ return getNodeOptimizedItem(resourceItem)
+ when "services"
+ return getServiceOptimizedItem(resourceItem)
+ when "deployments"
+ return getDeploymentOptimizedItem(resourceItem)
+ when "horizontalpodautoscalers"
+ return getHpaOptimizedItem(resourceItem)
+ else
+ return resourceItem
+ end
+ end
+
+ def getServiceOptimizedItem(resourceItem)
+ item = {}
+ begin
+ item["metadata"] = {}
+ if !resourceItem["metadata"].nil?
+ item["metadata"]["name"] = resourceItem["metadata"]["name"]
+ item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"]
+ end
+ item["spec"] = {}
+ if !resourceItem["spec"].nil?
+ item["spec"]["selector"] = []
+ if !resourceItem["spec"]["selector"].nil?
+ item["spec"]["selector"] = resourceItem["spec"]["selector"]
+ end
+ item["spec"]["clusterIP"] = ""
+ if !resourceItem["spec"]["clusterIP"].nil?
+ item["spec"]["clusterIP"] = resourceItem["spec"]["clusterIP"]
+ end
+ item["spec"]["type"] = ""
+ if !resourceItem["spec"]["type"].nil?
+ item["spec"]["type"] = resourceItem["spec"]["type"]
+ end
+ end
+ rescue => errorStr
+ @Log.warn "KubernetesApiClient::getServiceOptimizedItem:Failed with an error : #{errorStr}"
+ end
+ return item
+ end
+
+ def isWindowsNodeItem(nodeResourceItem)
+ isWindowsNodeItem = false
+ begin
+ nodeStatus = nodeResourceItem["status"]
+ if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil?
+ operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"]
+ if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0)
+ isWindowsNodeItem = true
+ end
+ end
+ rescue => errorStr
+ @Log.warn "KubernetesApiClient::isWindowsNodeItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}"
+ end
+ return isWindowsNodeItem
+ end
+
+ def getPodPerfOptimizedItem(resourceItem)
+ item = {}
+ begin
+ item["metadata"] = {}
+ if !resourceItem["metadata"].nil?
+ if !resourceItem["metadata"]["annotations"].nil?
+ item["metadata"]["annotations"] = {}
+ item["metadata"]["annotations"]["kubernetes.io/config.hash"] = resourceItem["metadata"]["annotations"]["kubernetes.io/config.hash"]
+ end
+
+ if !resourceItem["metadata"]["ownerReferences"].nil? && resourceItem["metadata"]["ownerReferences"].length > 0
+ item["metadata"]["ownerReferences"] = []
+ ownerReference = {}
+ ownerReference["name"] = resourceItem["metadata"]["ownerReferences"][0]["name"]
+ ownerReference["kind"] = resourceItem["metadata"]["ownerReferences"][0]["kind"]
+ item["metadata"]["ownerReferences"].push(ownerReference)
+ end
+ item["metadata"]["name"] = resourceItem["metadata"]["name"]
+ item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"]
+ item["metadata"]["uid"] = resourceItem["metadata"]["uid"]
+ end
+
+ item["spec"] = {}
+ if !resourceItem["spec"].nil?
+ item["spec"]["containers"] = []
+ if !resourceItem["spec"]["containers"].nil?
+ resourceItem["spec"]["containers"].each do |container|
+ currentContainer = {}
+ currentContainer["name"] = container["name"]
+ currentContainer["resources"] = container["resources"]
+ item["spec"]["containers"].push(currentContainer)
+ end
+ end
+ item["spec"]["initContainers"] = []
+ if !resourceItem["spec"]["initContainers"].nil?
+ resourceItem["spec"]["initContainers"].each do |container|
+ currentContainer = {}
+ currentContainer["name"] = container["name"]
+ currentContainer["resources"] = container["resources"]
+ item["spec"]["initContainers"].push(currentContainer)
+ end
+ end
+ item["spec"]["nodeName"] = ""
+ if !resourceItem["spec"]["nodeName"].nil?
+ item["spec"]["nodeName"] = resourceItem["spec"]["nodeName"]
+ end
+ end
+ rescue => errorStr
+ @Log.warn "KubernetesApiClient::getPodPerfOptimizedItem:Failed with an error : #{errorStr}"
+ end
+ return item
+ end
+
+ def getPodOptimizedItem(resourceItem, isWindowsPodItem)
+ item = {}
+ begin
+ item["metadata"] = {}
+ if !resourceItem["metadata"].nil?
+ if !resourceItem["metadata"]["annotations"].nil?
+ item["metadata"]["annotations"] = {}
+ item["metadata"]["annotations"]["kubernetes.io/config.hash"] = resourceItem["metadata"]["annotations"]["kubernetes.io/config.hash"]
+ end
+ if !resourceItem["metadata"]["labels"].nil?
+ item["metadata"]["labels"] = resourceItem["metadata"]["labels"]
+ end
+ if !resourceItem["metadata"]["ownerReferences"].nil? && resourceItem["metadata"]["ownerReferences"].length > 0
+ item["metadata"]["ownerReferences"] = []
+ ownerReference = {}
+ ownerReference["name"] = resourceItem["metadata"]["ownerReferences"][0]["name"]
+ ownerReference["kind"] = resourceItem["metadata"]["ownerReferences"][0]["kind"]
+ item["metadata"]["ownerReferences"].push(ownerReference)
+ end
+ item["metadata"]["name"] = resourceItem["metadata"]["name"]
+ item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"]
+ item["metadata"]["uid"] = resourceItem["metadata"]["uid"]
+ item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"]
+ if !resourceItem["metadata"]["deletionTimestamp"].nil?
+ item["metadata"]["deletionTimestamp"] = resourceItem["metadata"]["deletionTimestamp"]
+ end
+ end
+
+ item["spec"] = {}
+ if !resourceItem["spec"].nil?
+ item["spec"]["containers"] = []
+ item["spec"]["initContainers"] = []
+ isDisableClusterCollectEnvVar = false
+ clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"]
+ if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? && clusterCollectEnvironmentVar.casecmp("false") == 0
+ isDisableClusterCollectEnvVar = true
+ end
+
+ # container spec required only for windows container inventory records
+ if isWindowsPodItem
+ if !resourceItem["spec"]["containers"].nil?
+ resourceItem["spec"]["containers"].each do |container|
+ currentContainer = {}
+ currentContainer["name"] = container["name"]
+ currentContainer["resources"] = container["resources"]
+ # fields required for windows containers records
+ if isWindowsPodItem
+ currentContainer["image"] = container["image"]
+ currentContainer["ports"] = container["ports"]
+ currentContainer["command"] = container["command"]
+ currentContainer["env"] = ""
+ if !isDisableClusterCollectEnvVar
+ currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container)
+ end
+ end
+ item["spec"]["containers"].push(currentContainer)
+ end
+ end
+ if !resourceItem["spec"]["initContainers"].nil?
+ resourceItem["spec"]["initContainers"].each do |container|
+ currentContainer = {}
+ currentContainer["name"] = container["name"]
+ currentContainer["resources"] = container["resources"]
+ # fields required for windows containers records
+ if isWindowsPodItem
+ currentContainer["image"] = container["image"]
+ currentContainer["ports"] = container["ports"]
+ currentContainer["command"] = container["command"]
+ currentContainer["env"] = ""
+ if !isDisableClusterCollectEnvVar
+ currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container)
+ end
+ end
+ item["spec"]["initContainers"].push(currentContainer)
+ end
+ end
+ end
+
+ item["spec"]["nodeName"] = ""
+ if !resourceItem["spec"]["nodeName"].nil?
+ item["spec"]["nodeName"] = resourceItem["spec"]["nodeName"]
+ end
+ end
+ item["status"] = {}
+
+ if !resourceItem["status"].nil?
+ if !resourceItem["status"]["startTime"].nil?
+ item["status"]["startTime"] = resourceItem["status"]["startTime"]
+ end
+ if !resourceItem["status"]["reason"].nil?
+ item["status"]["reason"] = resourceItem["status"]["reason"]
+ end
+ if !resourceItem["status"]["podIP"].nil?
+ item["status"]["podIP"] = resourceItem["status"]["podIP"]
+ end
+ if !resourceItem["status"]["phase"].nil?
+ item["status"]["phase"] = resourceItem["status"]["phase"]
+ end
+ if !resourceItem["status"]["conditions"].nil?
+ item["status"]["conditions"] = []
+ resourceItem["status"]["conditions"].each do |condition|
+ currentCondition = {}
+ currentCondition["type"] = condition["type"]
+ currentCondition["status"] = condition["status"]
+ item["status"]["conditions"].push(currentCondition)
+ end
+ end
+ item["status"]["initContainerStatuses"] = []
+ if !resourceItem["status"]["initContainerStatuses"].nil?
+ resourceItem["status"]["initContainerStatuses"].each do |containerStatus|
+ currentContainerStatus = {}
+ currentContainerStatus["containerID"] = containerStatus["containerID"]
+ currentContainerStatus["name"] = containerStatus["name"]
+ currentContainerStatus["restartCount"] = containerStatus["restartCount"]
+ currentContainerStatus["state"] = containerStatus["state"]
+ currentContainerStatus["lastState"] = containerStatus["lastState"]
+ if isWindowsPodItem
+ currentContainerStatus["imageID"] = containerStatus["imageID"]
+ end
+ item["status"]["initContainerStatuses"].push(currentContainerStatus)
+ end
+ end
+ item["status"]["containerStatuses"] = []
+ if !resourceItem["status"]["containerStatuses"].nil?
+ resourceItem["status"]["containerStatuses"].each do |containerStatus|
+ currentContainerStatus = {}
+ currentContainerStatus["containerID"] = containerStatus["containerID"]
+ currentContainerStatus["name"] = containerStatus["name"]
+ currentContainerStatus["restartCount"] = containerStatus["restartCount"]
+ currentContainerStatus["state"] = containerStatus["state"]
+ currentContainerStatus["lastState"] = containerStatus["lastState"]
+ if isWindowsPodItem
+ currentContainerStatus["imageID"] = containerStatus["imageID"]
+ end
+ item["status"]["containerStatuses"].push(currentContainerStatus)
+ end
+ end
+ # this metadata used to identify the pod scheduled onto windows node
+ # so that pod inventory can make decision to extract containerinventory records or not
+ if isWindowsPodItem
+ item["isWindows"] = "true"
+ end
+ end
+ rescue => errorStr
+ @Log.warn "KubernetesApiClient::getPodOptimizedItem:Failed with an error : #{errorStr}"
+ end
+ return item
+ end
+
+ def getNodeAllocatableValues(nodeResourceItem)
+ nodeAllocatable = {}
+ begin
+ if !nodeResourceItem["status"].nil? &&
+ !nodeResourceItem["status"]["allocatable"].nil? &&
+ !nodeResourceItem["status"]["allocatable"].empty?
+ nodeAllocatable["cpu"] = nodeResourceItem["status"]["allocatable"]["cpu"]
+ nodeAllocatable["memory"] = nodeResourceItem["status"]["allocatable"]["memory"]
+ end
+ rescue => errorStr
+ @Log.warn "KubernetesApiClient::getNodeAllocatableValues:Failed with an error : #{errorStr}"
+ end
+ return nodeAllocatable
+ end
+
+ def getNodeOptimizedItem(resourceItem)
+ item = {}
+ begin
+ item["metadata"] = {}
+ if !resourceItem["metadata"].nil?
+ item["metadata"]["name"] = resourceItem["metadata"]["name"]
+ item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"]
+ if !resourceItem["metadata"]["labels"].nil?
+ item["metadata"]["labels"] = resourceItem["metadata"]["labels"]
+ end
+ end
+ item["spec"] = {}
+ if !resourceItem["spec"].nil?
+ if !resourceItem["spec"]["providerID"].nil? && !resourceItem["spec"]["providerID"].empty?
+ provider = resourceItem["spec"]["providerID"].split(":")[0]
+ if !provider.nil? && !provider.empty?
+ item["spec"]["providerID"] = provider
+ end
+ end
+ end
+ item["status"] = {}
+ if !resourceItem["status"].nil?
+ item["status"]["conditions"] = []
+ if !resourceItem["status"]["conditions"].nil?
+ resourceItem["status"]["conditions"].each do |condition|
+ currentCondition = {}
+ currentCondition["type"] = condition["type"]
+ currentCondition["status"] = condition["status"]
+ currentCondition["lastTransitionTime"] = condition["lastTransitionTime"]
+ item["status"]["conditions"].push(currentCondition)
+ end
+ end
+
+ nodeInfo = {}
+ if !resourceItem["status"]["nodeInfo"].nil? && !resourceItem["status"]["nodeInfo"].empty?
+ nodeInfo["kubeletVersion"] = resourceItem["status"]["nodeInfo"]["kubeletVersion"]
+ nodeInfo["kubeProxyVersion"] = resourceItem["status"]["nodeInfo"]["kubeProxyVersion"]
+ nodeInfo["osImage"] = resourceItem["status"]["nodeInfo"]["osImage"]
+ nodeInfo["containerRuntimeVersion"] = resourceItem["status"]["nodeInfo"]["containerRuntimeVersion"]
+ nodeInfo["operatingSystem"] = resourceItem["status"]["nodeInfo"]["operatingSystem"]
+ nodeInfo["kernelVersion"] = resourceItem["status"]["nodeInfo"]["kernelVersion"]
+ end
+ item["status"]["nodeInfo"] = nodeInfo
+
+ nodeAllocatable = {}
+ if !resourceItem["status"]["allocatable"].nil? && !resourceItem["status"]["allocatable"].empty?
+ nodeAllocatable["cpu"] = resourceItem["status"]["allocatable"]["cpu"]
+ nodeAllocatable["memory"] = resourceItem["status"]["allocatable"]["memory"]
+ if !resourceItem["status"]["allocatable"]["nvidia.com/gpu"].nil?
+ nodeAllocatable["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"]
+ end
+ if !resourceItem["status"]["allocatable"]["amd.com/gpu"].nil?
+ nodeAllocatable["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"]
+ end
+ end
+ item["status"]["allocatable"] = nodeAllocatable
+
+ nodeCapacity = {}
+ if !resourceItem["status"]["capacity"].nil? && !resourceItem["status"]["capacity"].empty?
+ nodeCapacity["cpu"] = resourceItem["status"]["capacity"]["cpu"]
+ nodeCapacity["memory"] = resourceItem["status"]["capacity"]["memory"]
+ if !resourceItem["status"]["capacity"]["nvidia.com/gpu"].nil?
+ nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["capacity"]["nvidia.com/gpu"]
+ end
+ if !resourceItem["status"]["capacity"]["amd.com/gpu"].nil?
+ nodeCapacity["amd.com/gpu"] = resourceItem["status"]["capacity"]["amd.com/gpu"]
+ end
+ end
+ item["status"]["capacity"] = nodeCapacity
+ end
+ rescue => errorStr
+ @Log.warn "KubernetesApiClient::getNodeOptimizedItem:Failed with an error : #{errorStr}"
+ end
+ return item
+ end
+
+ def getDeploymentOptimizedItem(resourceItem)
+ item = {}
+ begin
+ item["metadata"] = {}
+ if !resourceItem["metadata"].nil?
+ item["metadata"]["name"] = resourceItem["metadata"]["name"]
+ item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"]
+ end
+ item["spec"] = {}
+ if !resourceItem["spec"].nil?
+ item["spec"]["strategy"] = {}
+ if !resourceItem["spec"]["strategy"].nil? && !resourceItem["spec"]["strategy"].empty? && !resourceItem["spec"]["strategy"]["type"].nil?
+ item["spec"]["strategy"]["type"] = resourceItem["spec"]["strategy"]["type"]
+ end
+ if !resourceItem["spec"]["replicas"].nil?
+ item["spec"]["replicas"] = resourceItem["spec"]["replicas"]
+ end
+ end
+ item["status"] = {}
+ if !resourceItem["status"].nil?
+ if !resourceItem["status"]["readyReplicas"].nil?
+ item["status"]["readyReplicas"] = resourceItem["status"]["readyReplicas"]
+ end
+ if !resourceItem["status"]["updatedReplicas"].nil?
+ item["status"]["updatedReplicas"] = resourceItem["status"]["updatedReplicas"]
+ end
+ if !resourceItem["status"]["availableReplicas"].nil?
+ item["status"]["availableReplicas"] = resourceItem["status"]["availableReplicas"]
+ end
+ end
+ rescue => errorStr
+ @Log.warn "KubernetesApiClient::getDeploymentOptimizedItem:Failed with an error : #{errorStr}"
+ end
+ return item
+ end
+
+ def getHpaOptimizedItem(resourceItem)
+ item = {}
+ begin
+ item["metadata"] = {}
+ if !resourceItem["metadata"].nil?
+ item["metadata"]["name"] = resourceItem["metadata"]["name"]
+ item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"]
+ item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"]
+ end
+ item["spec"] = {}
+ if !resourceItem["spec"].nil?
+ if !resourceItem["spec"]["minReplicas"].nil?
+ item["spec"]["minReplicas"] = resourceItem["spec"]["minReplicas"]
+ end
+ if !resourceItem["spec"]["maxReplicas"].nil?
+ item["spec"]["maxReplicas"] = resourceItem["spec"]["maxReplicas"]
+ end
+ item["spec"]["scaleTargetRef"] = {}
+ if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["kind"].nil?
+ item["spec"]["scaleTargetRef"]["kind"] = resourceItem["spec"]["scaleTargetRef"]["kind"]
+ end
+ if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["name"].nil?
+ item["spec"]["scaleTargetRef"]["name"] = resourceItem["spec"]["scaleTargetRef"]["name"]
+ end
+ end
+ item["status"] = {}
+ if !resourceItem["status"].nil?
+ if !resourceItem["status"]["currentReplicas"].nil?
+ item["status"]["currentReplicas"] = resourceItem["status"]["currentReplicas"]
+ end
+ if !resourceItem["status"]["desiredReplicas"].nil?
+ item["status"]["desiredReplicas"] = resourceItem["status"]["desiredReplicas"]
+ end
+ if !resourceItem["status"]["lastScaleTime"].nil?
+ item["status"]["lastScaleTime"] = resourceItem["status"]["lastScaleTime"]
+ end
+ end
+ rescue => errorStr
+ @Log.warn "KubernetesApiClient::getHpaOptimizedItem:Failed with an error : #{errorStr}"
+ end
+ return item
+ end
+
+ def getPodReadyCondition(podStatusConditions)
+ podReadyCondition = false
+ begin
+ if !podStatusConditions.nil? && !podStatusConditions.empty?
+ podStatusConditions.each do |condition|
+ if condition["type"] == "Ready"
+ if condition["status"].downcase == "true"
+ podReadyCondition = true
+ end
+ break #Exit the for loop since we found the ready condition
+ end
+ end
+ end
+ rescue => err
+ @Log.warn "KubernetesApiClient::getPodReadyCondition failed with an error: #{err}"
+ end
+ return podReadyCondition
+ end
+
+ def isEmitCacheTelemetry
+ isEmitCacheTelemtryEnabled = false
+ if !ENV["EMIT_CACHE_TELEMETRY"].nil? && !ENV["EMIT_CACHE_TELEMETRY"].empty? && ENV["EMIT_CACHE_TELEMETRY"].downcase == "true"
+ isEmitCacheTelemtryEnabled = true
+ end
+ return isEmitCacheTelemtryEnabled
+ end
end
end
diff --git a/source/plugins/ruby/WatchStream.rb b/source/plugins/ruby/WatchStream.rb
new file mode 100644
index 000000000..6cc850450
--- /dev/null
+++ b/source/plugins/ruby/WatchStream.rb
@@ -0,0 +1,70 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+require "net/http"
+require "net/https"
+require "yajl/json_gem"
+require "logger"
+require "time"
+
+WATCH_ARGUMENTS = {
+ "labelSelector" => :label_selector,
+ "fieldSelector" => :field_selector,
+ "resourceVersion" => :resource_version,
+ "allowWatchBookmarks" => :allow_watch_bookmarks,
+ "timeoutSeconds" => :timeout_seconds,
+}.freeze
+
+# HTTP Stream used to watch changes on entities
+class WatchStream
+ def initialize(uri, http_options, http_headers, logger)
+ @uri = uri
+ @http_client = nil
+ @http_options = http_options
+ @http_headers = http_headers
+ @logger = logger
+ @path = ""
+ @logger.info "WatchStream::initialize @ #{Time.now.utc.iso8601}"
+ end
+
+ def each
+ @finished = false
+ buffer = +""
+ @logger.info "WatchStream::each:Opening TCP session @ #{Time.now.utc.iso8601}"
+ @http_client = Net::HTTP.start(@uri.host, @uri.port, @http_options)
+ if @http_client.nil?
+ raise "WatchStream::each:Failed to create HTTPClient object @ #{Time.now.utc.iso8601}"
+ end
+ @path = @uri.path
+ if @path.nil? || @path.empty?
+ raise "WatchStream::each:URI path should not be empty or nil @ #{Time.now.utc.iso8601}"
+ end
+ if !@uri.query.nil? && !@uri.query.empty?
+ @path += "?" + @uri.query
+ end
+ @logger.info "WatchStream::each:Making GET API call for Watch with path: #{@path} @ #{Time.now.utc.iso8601}"
+ @http_client.request_get(@path, @http_headers) do |response|
+ if !response.nil? && response.code.to_i > 300
+ raise "WatchStream::each:Watch connection of the path: #{@path} failed with an http status code: #{response.code} @ #{Time.now.utc.iso8601}"
+ end
+ response.read_body do |chunk|
+ buffer << chunk
+ while (line = buffer.slice!(/.+\n/))
+ yield(Yajl::Parser.parse(StringIO.new(line.chomp)))
+ end
+ end
+ end
+ rescue => e
+ raise e
+ end
+
+ def finish
+ begin
+ @finished = true
+ @logger.info "WatchStream::finish:Closing HTTP session of the path:#{@path} @ #{Time.now.utc.iso8601}"
+ @http_client.finish if !@http_client.nil? && @http_client.started?
+ rescue => error
+ @logger.warn "WatchStream::finish:Closing of HTTP session of the path: #{@path} failed with an error: #{error} @ #{Time.now.utc.iso8601}"
+ end
+ end
+end
diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb
index 542f342a6..5f57b465a 100644
--- a/source/plugins/ruby/constants.rb
+++ b/source/plugins/ruby/constants.rb
@@ -136,6 +136,12 @@ class Constants
#To evaluate switching to Windows AMA 64KB impacts any existing customers
MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY = 65536
+ # FileName for MDM POD Inventory state
+ MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json"
+ # FileName for NodeAllocatable Records state
+ NODE_ALLOCATABLE_RECORDS_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/NodeAllocatableRecords.json"
+ # Emit Stream size for Pod MDM metric
+ POD_MDM_EMIT_STREAM_BATCH_SIZE = 5000 # each record is 200 bytes, 5k records ~2MB
# only used in windows in AAD MSI auth mode
IMDS_TOKEN_PATH_FOR_WINDOWS = "c:/etc/imds-access-token/token"
end
diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb
index 5a52a089b..a3cbb5a85 100644
--- a/source/plugins/ruby/in_kube_nodes.rb
+++ b/source/plugins/ruby/in_kube_nodes.rb
@@ -7,11 +7,12 @@ module Fluent::Plugin
class Kube_nodeInventory_Input < Input
Fluent::Plugin.register_input("kube_nodes", self)
- def initialize(kubernetesApiClient = nil,
+ def initialize(is_unit_test_mode = nil, kubernetesApiClient = nil,
applicationInsightsUtility = nil,
extensionUtils = nil,
env = nil,
- telemetry_flush_interval = nil)
+ telemetry_flush_interval = nil,
+ node_items_test_cache = nil)
super()
require "yaml"
@@ -30,6 +31,8 @@ def initialize(kubernetesApiClient = nil,
@extensionUtils = extensionUtils == nil ? ExtensionUtils : extensionUtils
@env = env == nil ? ENV : env
@TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = telemetry_flush_interval == nil ? Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES : telemetry_flush_interval
+ @is_unit_test_mode = is_unit_test_mode == nil ? false : true
+ @node_items_test_cache = node_items_test_cache
+ # these defines were previously at class scope. Moving them into the constructor so that they can be set by unit tests
@@configMapMountPath = "/etc/config/settings/log-data-collection-settings"
@@ -63,6 +66,9 @@ def initialize(kubernetesApiClient = nil,
require_relative "constants"
@NodeCache = NodeStatsCache.new()
+ @watchNodesThread = nil
+ @nodeItemsCache = {}
+ @nodeItemsCacheSizeKB = 0
end
config_param :run_interval, :time, :default => 60
@@ -96,6 +102,8 @@ def start
@finished = false
@condition = ConditionVariable.new
@mutex = Mutex.new
+ @nodeCacheMutex = Mutex.new
+ @watchNodesThread = Thread.new(&method(:watch_nodes))
@thread = Thread.new(&method(:run_periodic))
@@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i
@@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i
@@ -109,6 +117,7 @@ def shutdown
@condition.signal
}
@thread.join
+ @watchNodesThread.join
super # This super must be at the end of shutdown method
end
end
@@ -147,43 +156,30 @@ def enumerate
# Initializing continuation token to nil
continuationToken = nil
- $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}")
- # KubernetesApiClient.getNodesResourceUri is a pure function, so call it from the actual module instead of from the mock
- resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}")
- continuationToken, nodeInventory = @kubernetesApiClient.getResourcesAndContinuationToken(resourceUri)
- $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}")
+ nodeInventory = {}
+ @nodeItemsCacheSizeKB = 0
+ nodeCount = 0
+ nodeInventory["items"] = getNodeItemsFromCache()
nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i
@nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime)
if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?)
- nodeCount += nodeInventory["items"].length
- $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
+ nodeCount = nodeInventory["items"].length
+ $log.info("in_kube_nodes::enumerate : number of node items :#{nodeCount} from Kube API @ #{Time.now.utc.iso8601}")
parse_and_emit_records(nodeInventory, batchTime)
else
$log.warn "in_kube_nodes::enumerate:Received empty nodeInventory"
end
-
- #If we receive a continuation token, make calls, process and flush data until we have processed all data
- while (!continuationToken.nil? && !continuationToken.empty?)
- nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i
- continuationToken, nodeInventory = @kubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}")
- nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i
- @nodesAPIE2ELatencyMs = @nodesAPIE2ELatencyMs + (nodesAPIChunkEndTime - nodesAPIChunkStartTime)
- if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?)
- nodeCount += nodeInventory["items"].length
- $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
- parse_and_emit_records(nodeInventory, batchTime)
- else
- $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory"
- end
- end
-
@nodeInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - nodeInventoryStartTime)
timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs
timeDifferenceInMinutes = timeDifference / 60
if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES)
@applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {})
@applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {})
- @applicationInsightsUtility.sendMetricTelemetry("NodeCount", nodeCount, {})
+ telemetryProperties = {}
+ if KubernetesApiClient.isEmitCacheTelemetry()
+ telemetryProperties["NODE_ITEMS_CACHE_SIZE_KB"] = @nodeItemsCacheSizeKB
+ end
+ ApplicationInsightsUtility.sendMetricTelemetry("NodeCount", nodeCount, telemetryProperties)
@@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i
end
# Setting this to nil so that we dont hold memory until GC kicks in
@@ -205,10 +201,19 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601)
insightsMetricsEventStream = Fluent::MultiEventStream.new
kubePerfEventStream = Fluent::MultiEventStream.new
@@istestvar = @env["ISTEST"]
+ nodeAllocatableRecords = {}
#get node inventory
nodeInventory["items"].each do |item|
# node inventory
nodeInventoryRecord = getNodeInventoryRecord(item, batchTime)
+ # node allocatable records for the kube perf plugin
+ nodeName = item["metadata"]["name"]
+ if !nodeName.nil? && !nodeName.empty?
+ nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item)
+ if !nodeAllocatable.nil? && !nodeAllocatable.empty?
+ nodeAllocatableRecords[nodeName] = nodeAllocatable
+ end
+ end
eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord
if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE
$log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}")
@@ -428,6 +433,17 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601)
$log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
end
end
+ if !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty?
+ nodeAllocatableRecordsJson = nodeAllocatableRecords.to_json
+ if !nodeAllocatableRecordsJson.empty?
+ @log.info "Writing node allocatable records to state file with size(bytes): #{nodeAllocatableRecordsJson.length}"
+ @log.info "in_kube_nodes::parse_and_emit_records:Start:writeNodeAllocatableRecords @ #{Time.now.utc.iso8601}"
+ writeNodeAllocatableRecords(nodeAllocatableRecordsJson)
+ @log.info "in_kube_nodes::parse_and_emit_records:End:writeNodeAllocatableRecords @ #{Time.now.utc.iso8601}"
+ end
+ nodeAllocatableRecordsJson = nil
+ nodeAllocatableRecords = nil
+ end
rescue => errorStr
$log.warn "Failed to retrieve node inventory: #{errorStr}"
$log.debug_backtrace(errorStr.backtrace)
@@ -577,6 +593,211 @@ def getNodeTelemetryProps(item)
end
return properties
end
+
+ def watch_nodes
+ if !@is_unit_test_mode
+ $log.info("in_kube_nodes::watch_nodes:Start @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil
+ loop do
+ begin
+ if nodesResourceVersion.nil?
+ # clear cache before filling the cache with list
+ @nodeCacheMutex.synchronize {
+ @nodeItemsCache.clear()
+ }
+ continuationToken = nil
+ resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}")
+ $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}")
+ continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri)
+ if responseCode.nil? || responseCode != "200"
+ $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}")
+ else
+ $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}")
+ if (!nodeInventory.nil? && !nodeInventory.empty?)
+ nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"]
+ if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?)
+ $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
+ nodeInventory["items"].each do |item|
+ key = item["metadata"]["uid"]
+ if !key.nil? && !key.empty?
+ nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item)
+ if !nodeItem.nil? && !nodeItem.empty?
+ @nodeCacheMutex.synchronize {
+ @nodeItemsCache[key] = nodeItem
+ }
+ else
+ $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ else
+ $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ else
+ $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}"
+ end
+ while (!continuationToken.nil? && !continuationToken.empty?)
+ continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}")
+ if responseCode.nil? || responseCode != "200"
+ $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil # break, if any of the pagination calls failed so that the full cache can be rebuilt with LIST again
+ break
+ else
+ if (!nodeInventory.nil? && !nodeInventory.empty?)
+ nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"]
+ if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?)
+ $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
+ nodeInventory["items"].each do |item|
+ key = item["metadata"]["uid"]
+ if !key.nil? && !key.empty?
+ nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item)
+ if !nodeItem.nil? && !nodeItem.empty?
+ @nodeCacheMutex.synchronize {
+ @nodeItemsCache[key] = nodeItem
+ }
+ else
+ $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ else
+ $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ else
+ $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ end
+ end
+ if nodesResourceVersion.nil? || nodesResourceVersion.empty? || nodesResourceVersion == "0"
+ # https://github.com/kubernetes/kubernetes/issues/74022
+ $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil # for the LIST to happen again
+ sleep(30) # do not overwhelm the api-server if api-server broken
+ else
+ begin
+ $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
+ watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true)
+ if watcher.nil?
+ $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
+ else
+ watcher.each do |notice|
+ case notice["type"]
+ when "ADDED", "MODIFIED", "DELETED", "BOOKMARK"
+ item = notice["object"]
+ # extract latest resource version to use for watch reconnect
+ if !item.nil? && !item.empty? &&
+ !item["metadata"].nil? && !item["metadata"].empty? &&
+ !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty?
+ nodesResourceVersion = item["metadata"]["resourceVersion"]
+ # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
+ else
+ $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil
+ # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
+ break
+ end
+ if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED"))
+ key = item["metadata"]["uid"]
+ if !key.nil? && !key.empty?
+ nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item)
+ if !nodeItem.nil? && !nodeItem.empty?
+ @nodeCacheMutex.synchronize {
+ @nodeItemsCache[key] = nodeItem
+ }
+ else
+ $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ else
+ $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ elsif notice["type"] == "DELETED"
+ key = item["metadata"]["uid"]
+ if !key.nil? && !key.empty?
+ @nodeCacheMutex.synchronize {
+ @nodeItemsCache.delete(key)
+ }
+ end
+ end
+ when "ERROR"
+ nodesResourceVersion = nil
+ $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}")
+ break
+ else
+ nodesResourceVersion = nil
+ $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}")
+ break
+ end
+ end
+ end
+ rescue Net::ReadTimeout => errorStr
+ ## This is expected if there is no activity on the cluster for more than readtimeout value used in the connection
+ # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ rescue => errorStr
+ $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil
+ sleep(5) # do not overwhelm the api-server if api-server broken
+ ensure
+ watcher.finish if watcher
+ end
+ end
+ rescue => errorStr
+ $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil
+ end
+ end
+ $log.info("in_kube_nodes::watch_nodes:End @ #{Time.now.utc.iso8601}")
+ end
+ end
+
+ def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson)
+ maxRetryCount = 5
+ initialRetryDelaySecs = 0.5
+ retryAttemptCount = 1
+ begin
+ f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "w")
+ if !f.nil?
+ isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB)
+ raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock
+ startTime = (Time.now.to_f * 1000).to_i
+ f.write(nodeAllocatbleRecordsJson)
+ f.flush
+ timetakenMs = ((Time.now.to_f * 1000).to_i - startTime)
+ $log.info "in_kube_nodes::writeNodeAllocatableRecords:Successful and with time taken(ms): #{timetakenMs}"
+ else
+ raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to open file for write"
+ end
+ rescue => err
+ if retryAttemptCount < maxRetryCount
+ f.flock(File::LOCK_UN) if !f.nil?
+ f.close if !f.nil?
+ retryAttemptCount = retryAttemptCount + 1
+ sleep (initialRetryDelaySecs * retryAttemptCount)
+ retry
+ end
+ $log.warn "in_kube_nodes::writeNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(err)
+ ensure
+ f.flock(File::LOCK_UN) if !f.nil?
+ f.close if !f.nil?
+ end
+ end
+
+ def getNodeItemsFromCache()
+ nodeItems = {}
+ if @is_unit_test_mode
+ nodeItems = @node_items_test_cache
+ else
+ @nodeCacheMutex.synchronize {
+ nodeItems = @nodeItemsCache.values.clone
+ if KubernetesApiClient.isEmitCacheTelemetry()
+ @nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024
+ end
+ }
+ end
+ return nodeItems
+ end
end # Kube_Node_Input
class NodeStatsCache
diff --git a/source/plugins/ruby/in_kube_nodes_test.rb b/source/plugins/ruby/in_kube_nodes_test.rb
index 8f4984c6c..7d55ea32d 100644
--- a/source/plugins/ruby/in_kube_nodes_test.rb
+++ b/source/plugins/ruby/in_kube_nodes_test.rb
@@ -1,10 +1,10 @@
-require 'minitest/autorun'
+require "minitest/autorun"
-require 'fluent/test'
-require 'fluent/test/driver/input'
-require 'fluent/test/helpers'
+require "fluent/test"
+require "fluent/test/driver/input"
+require "fluent/test/helpers"
-require_relative 'in_kube_nodes.rb'
+require_relative "in_kube_nodes.rb"
class InKubeNodesTests < Minitest::Test
include Fluent::Test::Helpers
@@ -13,20 +13,22 @@ def setup
Fluent::Test.setup
end
- def create_driver(conf = {}, kubernetesApiClient=nil, applicationInsightsUtility=nil, extensionUtils=nil, env=nil, telemetry_flush_interval=nil)
- Fluent::Test::Driver::Input.new(Fluent::Plugin::Kube_nodeInventory_Input.new(kubernetesApiClient=kubernetesApiClient,
- applicationInsightsUtility=applicationInsightsUtility,
- extensionUtils=extensionUtils,
- env=env)).configure(conf)
+ def create_driver(conf = {}, is_unit_test_mode = true, kubernetesApiClient = nil, applicationInsightsUtility = nil, extensionUtils = nil, env = nil, telemetry_flush_interval = nil, node_items_test_cache)
+ Fluent::Test::Driver::Input.new(Fluent::Plugin::Kube_nodeInventory_Input.new(is_unit_test_mode, kubernetesApiClient = kubernetesApiClient,
+ applicationInsightsUtility = applicationInsightsUtility,
+ extensionUtils = extensionUtils,
+ env = env,
+ telemetry_flush_interval,
+ node_items_test_cache)).configure(conf)
end
# Collection time of scrapped data will always be different. Overwrite it in any records returned by in_kube_ndes.rb
def overwrite_collection_time(data)
if data.key?("CollectionTime")
- data["CollectionTime"] = "~CollectionTime~"
+ data["CollectionTime"] = "~CollectionTime~"
end
if data.key?("Timestamp")
- data["Timestamp"] = "~Timestamp~"
+ data["Timestamp"] = "~Timestamp~"
end
return data
end
@@ -45,41 +47,46 @@ def test_basic_single_node
# isAADMSIAuthMode() is called multiple times and we don't really care how many time it is called. This is the same as mocking
# but it doesn't track how many times isAADMSIAuthMode is called
def extensionUtils.isAADMSIAuthMode
- false
+ false
end
nodes_api_response = eval(File.open("test/unit-tests/canned-api-responses/kube-nodes.txt").read)
- kubeApiClient.expect(:getResourcesAndContinuationToken, [nil, nodes_api_response], ["nodes?limit=200"])
+ node_items_test_cache = nodes_api_response["items"]
+
kubeApiClient.expect(:getClusterName, "/cluster-name")
kubeApiClient.expect(:getClusterId, "/cluster-id")
+ def appInsightsUtil.sendExceptionTelemetry(exception)
+ if exception.to_s != "undefined method `[]' for nil:NilClass"
+ raise "an unexpected exception has occured"
+ end
+ end
config = "run_interval 999999999" # only run once
- d = create_driver(config, kubernetesApiClient=kubeApiClient, applicationInsightsUtility=appInsightsUtil, extensionUtils=extensionUtils, env=env)
+ d = create_driver(config, true, kubernetesApiClient = kubeApiClient, applicationInsightsUtility = appInsightsUtil, extensionUtils = extensionUtils, env = env, node_items_test_cache)
d.instance.start
d.instance.enumerate
d.run(timeout: 99999) # Input plugins decide when to run, so we have to give it enough time to run
-
- expected_responses = { ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"aks-nodepool1-24816391-vmss000000", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"})] => true,
- ["mdm.kubenodeinventory", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"aks-nodepool1-24816391-vmss000000", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"})] => true,
- ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"})] => true,
- ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1900000000.0}]"})] => true,
- ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":4787511296.0}]"})] => true,
- ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000000.0}]"})] => true,
- ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":7291510784.0}]"})] => true}
+ expected_responses = { ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", overwrite_collection_time({ "CollectionTime" => "2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "aks-nodepool1-24816391-vmss000000", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" })] => true,
+ ["mdm.kubenodeinventory", overwrite_collection_time({ "CollectionTime" => "2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "aks-nodepool1-24816391-vmss000000", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" })] => true,
+ ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", overwrite_collection_time({ "CollectionTime" => "2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" })] => true,
+ ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1900000000.0}]" })] => true,
+ ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":4787511296.0}]" })] => true,
+ ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000000.0}]" })] => true,
+ ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":7291510784.0}]" })] => true }
d.events.each do |tag, time, record|
- cleaned_record = overwrite_collection_time record
- if expected_responses.key?([tag, cleaned_record])
- expected_responses[[tag, cleaned_record]] = true
- else
- assert(false, "got unexpected record")
- end
+ cleaned_record = overwrite_collection_time record
+ if expected_responses.key?([tag, cleaned_record])
+ expected_responses[[tag, cleaned_record]] = true
+ else
+ assert(false, "got unexpected record: #{cleaned_record}")
+ end
end
expected_responses.each do |key, val|
- assert(val, "expected record not emitted: #{key}")
+ assert(val, "expected record not emitted: #{key}")
end
# make sure all mocked methods were called the expected number of times
@@ -104,7 +111,7 @@ def test_malformed_node_spec
# isAADMSIAuthMode() is called multiple times and we don't really care how many time it is called. This is the same as mocking
# but it doesn't track how many times isAADMSIAuthMode is called
def extensionUtils.isAADMSIAuthMode
- false
+ false
end
# Set up the KubernetesApiClient Mock. Note: most of the functions in KubernetesApiClient are pure (access no
@@ -112,16 +119,17 @@ def extensionUtils.isAADMSIAuthMode
# more brittle). Instead, in_kube_nodes bypasses the mock and directly calls these functions in KubernetesApiClient.
# Ideally the pure functions in KubernetesApiClient would be refactored into their own file to reduce confusion.
nodes_api_response = eval(File.open("test/unit-tests/canned-api-responses/kube-nodes-malformed.txt").read)
- kubeApiClient.expect(:getResourcesAndContinuationToken, [nil, nodes_api_response], ["nodes?limit=200"])
+ node_items_test_cache = nodes_api_response["items"]
+
kubeApiClient.expect(:getClusterName, "/cluster-name")
kubeApiClient.expect(:getClusterName, "/cluster-name")
kubeApiClient.expect(:getClusterId, "/cluster-id")
kubeApiClient.expect(:getClusterId, "/cluster-id")
def appInsightsUtil.sendExceptionTelemetry(exception)
- if exception.to_s != "undefined method `[]' for nil:NilClass"
- raise "an unexpected exception has occured"
- end
+ if exception.to_s != "undefined method `[]' for nil:NilClass"
+ raise "an unexpected exception has occured"
+ end
end
# This test doesn't care if metric telemetry is sent properly. Looking for an unnecessary value would make it needlessly rigid
@@ -130,38 +138,38 @@ def appInsightsUtil.sendMetricTelemetry(a, b, c)
config = "run_interval 999999999" # only run once
- d = create_driver(config, kubernetesApiClient=kubeApiClient, applicationInsightsUtility=appInsightsUtil, extensionUtils=extensionUtils, env=env, telemetry_flush_interval=0)
+ d = create_driver(config, true, kubernetesApiClient = kubeApiClient, applicationInsightsUtility = appInsightsUtil, extensionUtils = extensionUtils, env = env, telemetry_flush_interval = 0, node_items_test_cache)
d.instance.start
d.instance.enumerate
d.run(timeout: 99999) #TODO: is this necessary?
expected_responses = {
- ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"correct-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false,
- ["mdm.kubenodeinventory", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"correct-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false,
- ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"}] => false,
- ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1000000.0}]"}] => false,
- ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":444.0}]"}] => false,
- ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000.0}]"}] => false,
- ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":555.0}]"}] => false,
-
- # these records are for the malformed node (it doesn't have limits or requests set so there are no PERF records)
- ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"malformed-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false,
- ["mdm.kubenodeinventory", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"malformed-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false,
- ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"}] => false
+ ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "correct-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false,
+ ["mdm.kubenodeinventory", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "correct-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false,
+ ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" }] => false,
+ ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1000000.0}]" }] => false,
+ ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":444.0}]" }] => false,
+ ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000.0}]" }] => false,
+ ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":555.0}]" }] => false,
+
+ # these records are for the malformed node (it doesn't have limits or requests set so there are no PERF records)
+ ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "malformed-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false,
+ ["mdm.kubenodeinventory", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "malformed-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false,
+ ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" }] => false,
}
d.events.each do |tag, time, record|
- cleaned_record = overwrite_collection_time record
- if expected_responses.key?([tag, cleaned_record])
- expected_responses[[tag, cleaned_record]] = true
- end
- # don't do anything if an unexpected record was emitted. Since the node spec is malformed, there will be some partial data.
- # we care more that the non-malformed data is still emitted
+ cleaned_record = overwrite_collection_time record
+ if expected_responses.key?([tag, cleaned_record])
+ expected_responses[[tag, cleaned_record]] = true
+ end
+ # don't do anything if an unexpected record was emitted. Since the node spec is malformed, there will be some partial data.
+ # we care more that the non-malformed data is still emitted
end
expected_responses.each do |key, val|
- assert(val, "expected record not emitted: #{key}")
+ assert(val, "expected record not emitted: #{key}")
end
kubeApiClient.verify
diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb
new file mode 100644
index 000000000..ad8fdbf21
--- /dev/null
+++ b/source/plugins/ruby/in_kube_perfinventory.rb
@@ -0,0 +1,433 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+require "fluent/plugin/input"
+
+module Fluent::Plugin
+ class Kube_PerfInventory_Input < Input
+ Fluent::Plugin.register_input("kube_perfinventory", self)
+
+    # Plugin construction: loads runtime dependencies lazily (inside
+    # initialize, per this codebase's convention), seeds the pod-items cache
+    # used by the watch thread, and sets the default output tags (replaced at
+    # runtime with extension stream ids when AAD MSI auth mode is on).
+    def initialize
+      super
+      require "yaml"
+      require "yajl/json_gem"
+      require "yajl"
+      require "set"
+      require "time"
+      require "net/http"
+
+      require_relative "KubernetesApiClient"
+      require_relative "ApplicationInsightsUtility"
+      require_relative "oms_common"
+      require_relative "omslog"
+      require_relative "constants"
+      require_relative "extension_utils"
+
+      # refer tomlparser-agent-config for updating defaults
+      # these are configurable via configmap; real values are read from ENV in start()
+      @PODS_CHUNK_SIZE = 0
+      @PODS_EMIT_STREAM_BATCH_SIZE = 0
+
+      # background LIST+WATCH thread and the cache it maintains (pod uid -> optimized pod item)
+      @watchPodsThread = nil
+      @podItemsCache = {}
+
+      # default output tags; overridden in enumerate() when AAD MSI auth mode is enabled
+      @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB"
+      @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB"
+    end
+
+    # Fluentd config parameters: polling interval (seconds) and default emit tag.
+    config_param :run_interval, :time, :default => 60
+    config_param :tag, :string, :default => "oneagent.containerInsights.LINUX_PERF_BLOB"
+
+    # Standard fluentd configure hook; no plugin-specific configuration beyond
+    # the config_param declarations above.
+    def configure(conf)
+      super
+    end
+
+    # Fluentd start hook: reads chunk/batch sizes from ENV (with safe-guard
+    # defaults), then spawns the periodic enumerate thread and the pods
+    # LIST+WATCH cache thread. Threads are only created when run_interval is
+    # set (it defaults to 60 via config_param, so this is effectively always).
+    def start
+      if @run_interval
+        super
+        if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? && ENV["PODS_CHUNK_SIZE"].to_i > 0
+          @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"].to_i
+        else
+          # this shouldn't happen; just setting a default here as a safeguard
+          $log.warn("in_kube_perfinventory::start: setting to default value since got PODS_CHUNK_SIZE nil or empty")
+          @PODS_CHUNK_SIZE = 1000
+        end
+        $log.info("in_kube_perfinventory::start: PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}")
+
+        if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? && ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i > 0
+          @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i
+        else
+          # this shouldn't happen; just setting a default here as a safeguard
+          $log.warn("in_kube_perfinventory::start: setting to default value since got PODS_EMIT_STREAM_BATCH_SIZE nil or empty")
+          @PODS_EMIT_STREAM_BATCH_SIZE = 200
+        end
+        $log.info("in_kube_perfinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}")
+
+        @finished = false
+        @condition = ConditionVariable.new
+        @mutex = Mutex.new
+        # @podCacheMutex guards @podItemsCache, shared between the watch thread
+        # (writer) and enumerate (reader)
+        @podCacheMutex = Mutex.new
+        @thread = Thread.new(&method(:run_periodic))
+        @watchPodsThread = Thread.new(&method(:watch_pods))
+      end
+    end
+
+    # Fluentd shutdown hook: signals the periodic thread to finish and joins
+    # both worker threads.
+    # NOTE(review): watch_pods loops forever and does not check @finished, so
+    # @watchPodsThread.join may block shutdown indefinitely — confirm intended.
+    def shutdown
+      if @run_interval
+        @mutex.synchronize {
+          @finished = true
+          @condition.signal
+        }
+        @thread.join
+        @watchPodsThread.join
+        super # This super must be at the end of shutdown method
+      end
+    end
+
+    # One collection cycle: snapshots the watch-maintained pod cache, loads
+    # node allocatable records from the shared state file, and hands both to
+    # parse_and_emit_records. Any failure is logged and sent to App Insights.
+    # NOTE(review): the podList parameter is assigned to podInventory but then
+    # overwritten below by the cache snapshot, so it is effectively unused.
+    def enumerate(podList = nil)
+      begin
+        podInventory = podList
+        @podCount = 0
+        currentTime = Time.now
+        batchTime = currentTime.utc.iso8601
+        if ExtensionUtils.isAADMSIAuthMode()
+          $log.info("in_kube_perfinventory::enumerate: AAD AUTH MSI MODE")
+          # resolve extension output stream ids once; skipped on later cycles
+          # because the resolved tags already carry the extension prefix
+          if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX)
+            @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE)
+          end
+          if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX)
+            @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE)
+          end
+          $log.info("in_kube_perfinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}")
+          $log.info("in_kube_perfinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}")
+        end
+
+        nodeAllocatableRecords = getNodeAllocatableRecords()
+        # Initializing continuation token to nil
+        continuationToken = nil
+        # NOTE(review): podItemsCacheSizeKB is never read in this plugin
+        # (telemetry counterpart exists in in_kube_podinventory) — dead local
+        podItemsCacheSizeKB = 0
+        podInventory = {}
+        # snapshot the cache under the lock so the watch thread can keep writing
+        @podCacheMutex.synchronize {
+          podInventory["items"] = @podItemsCache.values.clone
+        }
+        if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
+          $log.info("in_kube_perfinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
+          parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationToken, batchTime)
+        else
+          $log.warn "in_kube_perfinventory::enumerate:Received empty podInventory"
+        end
+        # Setting these to nil so that we dont hold memory until GC kicks in
+        podInventory = nil
+        nodeAllocatableRecords = nil
+      rescue => errorStr
+        $log.warn "in_kube_perfinventory::enumerate:Failed in enumerate: #{errorStr}"
+        $log.debug_backtrace(errorStr.backtrace)
+        ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+      end
+    end
+
+    # Builds and emits container perf records (cpu/memory requests & limits)
+    # and GPU insights-metrics records for every pod item, batching emits at
+    # @PODS_EMIT_STREAM_BATCH_SIZE and flushing any remainder at the end.
+    # continuationToken is accepted for signature parity but not used here.
+    # NOTE(review): the default `Time.utc.iso8601` is a latent bug — Ruby's
+    # Time.utc requires a year argument, so this default would raise
+    # ArgumentError if ever evaluated; callers always pass batchTime today.
+    # Likely intended: `Time.now.utc.iso8601`.
+    def parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationToken, batchTime = Time.utc.iso8601)
+      currentTime = Time.now
+      emitTime = Fluent::Engine.now
+      kubePerfEventStream = Fluent::MultiEventStream.new
+      insightsMetricsEventStream = Fluent::MultiEventStream.new
+      @@istestvar = ENV["ISTEST"]
+
+      begin #begin block start
+        podInventory["items"].each do |item| #podInventory block start
+          nodeName = ""
+          if !item["spec"]["nodeName"].nil?
+            nodeName = item["spec"]["nodeName"]
+          end
+
+          # allocatable record for the pod's node (used to compute percentages
+          # downstream); empty hash when the node is unknown
+          nodeAllocatableRecord = {}
+          if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? && nodeAllocatableRecords.has_key?(nodeName)
+            nodeAllocatableRecord = nodeAllocatableRecords[nodeName]
+          end
+          #container perf records
+          containerMetricDataItems = []
+          containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime))
+          containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime))
+          containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime))
+          containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime))
+
+          containerMetricDataItems.each do |record|
+            kubePerfEventStream.add(emitTime, record) if record
+          end
+
+          # emit perf batch when it reaches the configured size
+          if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE
+            $log.info("in_kube_perfinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}")
+            router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream
+            if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
+              $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+            end
+            kubePerfEventStream = Fluent::MultiEventStream.new
+          end
+
+          # container GPU records
+          containerGPUInsightsMetricsDataItems = []
+          containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime))
+          containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime))
+          containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime))
+          containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime))
+          containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord|
+            insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord
+          end
+
+          # emit insights-metrics batch when it reaches the configured size
+          if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE
+            $log.info("in_kube_perfinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}")
+            if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
+              $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+            end
+            router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream
+            insightsMetricsEventStream = Fluent::MultiEventStream.new
+          end
+        end  #podInventory block end
+
+        # flush any remaining perf records
+        if kubePerfEventStream.count > 0
+          $log.info("in_kube_perfinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}")
+          router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream
+          kubePerfEventStream = nil
+          if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
+            $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+          end
+        end
+
+        # flush any remaining insights-metrics records
+        if insightsMetricsEventStream.count > 0
+          $log.info("in_kube_perfinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}")
+          router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream
+          if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
+            $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+          end
+          insightsMetricsEventStream = nil
+        end
+      rescue => errorStr
+        $log.warn "Failed in parse_and_emit_record kube perf inventory: #{errorStr}"
+        $log.debug_backtrace(errorStr.backtrace)
+        ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+      end #begin block end
+    end
+
+    # Periodic driver: runs enumerate every @run_interval seconds, using a
+    # mutex + condition variable so shutdown can wake it early. The schedule
+    # is drift-compensating: the next run time advances by a fixed interval,
+    # and if a cycle overran, the next wait shrinks to 1 second.
+    # (Lock is deliberately released around enumerate so shutdown isn't blocked.)
+    def run_periodic
+      @mutex.lock
+      done = @finished
+      @nextTimeToRun = Time.now
+      @waitTimeout = @run_interval
+      until done
+        @nextTimeToRun = @nextTimeToRun + @run_interval
+        @now = Time.now
+        if @nextTimeToRun <= @now
+          # previous cycle overran the interval; run again almost immediately
+          @waitTimeout = 1
+          @nextTimeToRun = @now
+        else
+          @waitTimeout = @nextTimeToRun - @now
+        end
+        @condition.wait(@mutex, @waitTimeout)
+        done = @finished
+        @mutex.unlock
+        if !done
+          begin
+            $log.info("in_kube_perfinventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}")
+            enumerate
+            $log.info("in_kube_perfinventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}")
+          rescue => errorStr
+            $log.warn "in_kube_perfinventory::run_periodic: enumerate Failed to retrieve perf inventory: #{errorStr}"
+            ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+          end
+        end
+        @mutex.lock
+      end
+      @mutex.unlock
+    end
+
+    # Background thread: keeps @podItemsCache in sync with the API server via
+    # the standard Kubernetes LIST+WATCH pattern.
+    #   - When podsResourceVersion is nil: clear the cache, LIST all pods
+    #     (paginated via continuation tokens), and record the resourceVersion.
+    #   - Then WATCH from that resourceVersion, applying ADDED/MODIFIED/DELETED
+    #     events to the cache; BOOKMARK events only advance the resourceVersion.
+    #   - Any failure resets podsResourceVersion to nil so a fresh LIST rebuilds
+    #     the cache; sleeps back off a broken API server.
+    # Runs forever (loop has no exit condition); see shutdown note.
+    def watch_pods
+      $log.info("in_kube_perfinventory::watch_pods:Start @ #{Time.now.utc.iso8601}")
+      podsResourceVersion = nil
+      loop do
+        begin
+          if podsResourceVersion.nil?
+            # clear cache before filling the cache with list
+            @podCacheMutex.synchronize {
+              @podItemsCache.clear()
+            }
+            continuationToken = nil
+            resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}"
+            $log.info("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}")
+            continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri)
+            if responseCode.nil? || responseCode != "200"
+              $log.warn("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}")
+            else
+              $log.info("in_kube_perfinventory::watch_pods:Done getting pods from Kube API:#{resourceUri} @ #{Time.now.utc.iso8601}")
+              if (!podInventory.nil? && !podInventory.empty?)
+                podsResourceVersion = podInventory["metadata"]["resourceVersion"]
+                if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
+                  $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length}  from Kube API @ #{Time.now.utc.iso8601}")
+                  podInventory["items"].each do |item|
+                    key = item["metadata"]["uid"]
+                    if !key.nil? && !key.empty?
+                      # store only the perf-relevant subset of the pod spec
+                      podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item)
+                      if !podItem.nil? && !podItem.empty?
+                        @podCacheMutex.synchronize {
+                          @podItemsCache[key] = podItem
+                        }
+                      else
+                        $log.warn "in_kube_perfinventory::watch_pods:Received podItem either empty or nil  @ #{Time.now.utc.iso8601}"
+                      end
+                    else
+                      $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty  @ #{Time.now.utc.iso8601}"
+                    end
+                  end
+                end
+              else
+                $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory"
+              end
+              # fetch remaining LIST pages; a failed page aborts so the full
+              # cache is rebuilt with a fresh LIST
+              while (!continuationToken.nil? && !continuationToken.empty?)
+                resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}"
+                continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri)
+                if responseCode.nil? || responseCode != "200"
+                  $log.warn("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}")
+                  podsResourceVersion = nil
+                  break # break, if any of the pagination call failed so that full cache will rebuild with LIST again
+                else
+                  if (!podInventory.nil? && !podInventory.empty?)
+                    podsResourceVersion = podInventory["metadata"]["resourceVersion"]
+                    if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
+                      $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length}  from Kube API @ #{Time.now.utc.iso8601}")
+                      podInventory["items"].each do |item|
+                        key = item["metadata"]["uid"]
+                        if !key.nil? && !key.empty?
+                          podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item)
+                          if !podItem.nil? && !podItem.empty?
+                            @podCacheMutex.synchronize {
+                              @podItemsCache[key] = podItem
+                            }
+                          else
+                            $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil  @ #{Time.now.utc.iso8601}"
+                          end
+                        else
+                          $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty  @ #{Time.now.utc.iso8601}"
+                        end
+                      end
+                    end
+                  else
+                    $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory  @ #{Time.now.utc.iso8601}"
+                  end
+                end
+              end
+            end
+          end
+          if podsResourceVersion.nil? || podsResourceVersion.empty? || podsResourceVersion == "0"
+            # https://github.com/kubernetes/kubernetes/issues/74022
+            $log.warn("in_kube_perfinventory::watch_pods:received podsResourceVersion: #{podsResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}")
+            podsResourceVersion = nil # for the LIST to happen again
+            sleep(30) # do not overwhelm the api-server if api-server broken
+          else
+            begin
+              $log.info("in_kube_perfinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
+              watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true)
+              if watcher.nil?
+                $log.warn("in_kube_perfinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
+              else
+                watcher.each do |notice|
+                  case notice["type"]
+                  when "ADDED", "MODIFIED", "DELETED", "BOOKMARK"
+                    item = notice["object"]
+                    # extract latest resource version to use for watch reconnect
+                    if !item.nil? && !item.empty? &&
+                       !item["metadata"].nil? && !item["metadata"].empty? &&
+                       !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty?
+                      podsResourceVersion = item["metadata"]["resourceVersion"]
+                      # $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
+                    else
+                      $log.warn("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}")
+                      podsResourceVersion = nil
+                      # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
+                      break
+                    end
+                    if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED"))
+                      key = item["metadata"]["uid"]
+                      if !key.nil? && !key.empty?
+                        podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item)
+                        if !podItem.nil? && !podItem.empty?
+                          @podCacheMutex.synchronize {
+                            @podItemsCache[key] = podItem
+                          }
+                        else
+                          $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil  @ #{Time.now.utc.iso8601}"
+                        end
+                      else
+                        $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty  @ #{Time.now.utc.iso8601}"
+                      end
+                    elsif notice["type"] == "DELETED"
+                      key = item["metadata"]["uid"]
+                      if !key.nil? && !key.empty?
+                        @podCacheMutex.synchronize {
+                          @podItemsCache.delete(key)
+                        }
+                      end
+                    end
+                  when "ERROR"
+                    # e.g. 410 Gone: resourceVersion too old — force a re-LIST
+                    podsResourceVersion = nil
+                    $log.warn("in_kube_perfinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}")
+                    break
+                  else
+                    podsResourceVersion = nil
+                    $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}")
+                  end
+                end
+                $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods @ #{Time.now.utc.iso8601}")
+              end
+            rescue Net::ReadTimeout => errorStr
+              ## This expected if there is no activity more than readtimeout value used in the connection
+              # $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+            rescue => errorStr
+              $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+              podsResourceVersion = nil
+              sleep(5) # do not overwhelm the api-server if api-server broken
+            ensure
+              watcher.finish if watcher
+            end
+          end
+        rescue => errorStr
+          $log.warn("in_kube_perfinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+          podsResourceVersion = nil
+        end
+      end
+      $log.info("in_kube_perfinventory::watch_pods:End @ #{Time.now.utc.iso8601}")
+    end
+
+    # Reads the node-allocatable records hash (nodeName -> allocatable
+    # resources) from the shared state file written by another plugin, taking
+    # a non-blocking exclusive flock. On any failure it backs off with a
+    # linearly increasing delay and retries (1 initial attempt + up to
+    # maxRetryCount - 1 retries); returns {} if all attempts fail.
+    # NOTE(review): the warn message reports "after retries: #{maxRetryCount}"
+    # though at most maxRetryCount - 1 retries occur — cosmetic only.
+    def getNodeAllocatableRecords()
+      maxRetryCount = 5
+      initialRetryDelaySecs = 0.5
+      retryAttemptCount = 1
+      nodeAllocatableRecords = {}
+      begin
+        f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "r")
+        if !f.nil?
+          isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB)
+          raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock
+          startTime = (Time.now.to_f * 1000).to_i
+          nodeAllocatableRecords = Yajl::Parser.parse(f)
+          timetakenMs = ((Time.now.to_f * 1000).to_i - startTime)
+          $log.info "in_kube_perfinventory:getNodeAllocatableRecords:Number of Node Allocatable records: #{nodeAllocatableRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}"
+        else
+          raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to open file for read"
+        end
+      rescue => err
+        if retryAttemptCount < maxRetryCount
+          # release/close before retry; retry re-enters the begin block so the
+          # ensure below runs only on final exit
+          f.flock(File::LOCK_UN) if !f.nil?
+          f.close if !f.nil?
+          sleep (initialRetryDelaySecs * retryAttemptCount)
+          retryAttemptCount = retryAttemptCount + 1
+          retry
+        end
+        $log.warn "in_kube_perfinventory:getNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}"
+        ApplicationInsightsUtility.sendExceptionTelemetry(err)
+      ensure
+        f.flock(File::LOCK_UN) if !f.nil?
+        f.close if !f.nil?
+      end
+      return nodeAllocatableRecords
+    end
+ end # Kube_Pod_Input
+end # module
diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb
index f979ef7c5..bdbc465ec 100644
--- a/source/plugins/ruby/in_kube_podinventory.rb
+++ b/source/plugins/ruby/in_kube_podinventory.rb
@@ -4,12 +4,9 @@
require "fluent/plugin/input"
module Fluent::Plugin
- require_relative "podinventory_to_mdm"
-
class Kube_PodInventory_Input < Input
Fluent::Plugin.register_input("kube_podinventory", self)
- @@MDMKubePodInventoryTag = "mdm.kubepodinventory"
@@hostName = (OMS::Common.get_hostname)
def initialize
@@ -19,6 +16,8 @@ def initialize
require "yajl"
require "set"
require "time"
+ require "net/http"
+ require "fileutils"
require_relative "kubernetes_container_inventory"
require_relative "KubernetesApiClient"
@@ -27,11 +26,13 @@ def initialize
require_relative "omslog"
require_relative "constants"
require_relative "extension_utils"
+ require_relative "CustomMetricsUtils"
# refer tomlparser-agent-config for updating defaults
# this configurable via configmap
@PODS_CHUNK_SIZE = 0
@PODS_EMIT_STREAM_BATCH_SIZE = 0
+ @NODES_CHUNK_SIZE = 0
@podCount = 0
@containerCount = 0
@@ -47,11 +48,18 @@ def initialize
@controllerData = {}
@podInventoryE2EProcessingLatencyMs = 0
@podsAPIE2ELatencyMs = 0
+ @watchPodsThread = nil
+ @podItemsCache = {}
+
+ @watchServicesThread = nil
+ @serviceItemsCache = {}
+
+ @watchWinNodesThread = nil
+ @windowsNodeNameListCache = []
+ @windowsContainerRecordsCacheSizeBytes = 0
- @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB"
@kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB"
@containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB"
- @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB"
end
config_param :run_interval, :time, :default => 60
@@ -59,7 +67,6 @@ def initialize
def configure(conf)
super
- @inventoryToMdmConvertor = Inventory2MdmConvertor.new()
end
def start
@@ -82,10 +89,26 @@ def start
@PODS_EMIT_STREAM_BATCH_SIZE = 200
end
$log.info("in_kube_podinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}")
+
+ if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? && ENV["NODES_CHUNK_SIZE"].to_i > 0
+ @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i
+ else
+ # this shouldnt happen just setting default here as safe guard
+ $log.warn("in_kube_podinventory::start: setting to default value since got NODES_CHUNK_SIZE nil or empty")
+ @NODES_CHUNK_SIZE = 250
+ end
+ $log.info("in_kube_podinventory::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}")
+
@finished = false
@condition = ConditionVariable.new
@mutex = Mutex.new
+ @podCacheMutex = Mutex.new
+ @serviceCacheMutex = Mutex.new
+ @windowsNodeNameCacheMutex = Mutex.new
@thread = Thread.new(&method(:run_periodic))
+ @watchWinNodesThread = Thread.new(&method(:watch_windows_nodes))
+ @watchPodsThread = Thread.new(&method(:watch_pods))
+ @watchServicesThread = Thread.new(&method(:watch_services))
@@podTelemetryTimeTracker = DateTime.now.to_time.to_i
end
end
@@ -97,6 +120,9 @@ def shutdown
@condition.signal
}
@thread.join
+ @watchPodsThread.join
+ @watchServicesThread.join
+ @watchWinNodesThread.join
super # This super must be at the end of shutdown method
end
end
@@ -110,6 +136,7 @@ def enumerate(podList = nil)
@serviceCount = 0
@controllerSet = Set.new []
@winContainerCount = 0
+ @windowsContainerRecordsCacheSizeBytes = 0
@winContainerInventoryTotalSizeBytes = 0
@winContainerCountWithInventoryRecordSize64KBOrMore = 0
@winContainerCountWithEnvVarSize64KBOrMore = 0
@@ -121,6 +148,7 @@ def enumerate(podList = nil)
batchTime = currentTime.utc.iso8601
serviceRecords = []
@podInventoryE2EProcessingLatencyMs = 0
+ @mdmPodRecordItems = []
podInventoryStartTime = (Time.now.to_f * 1000).to_i
if ExtensionUtils.isAADMSIAuthMode()
$log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE")
@@ -146,32 +174,31 @@ def enumerate(podList = nil)
$log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}")
end
- # Get services first so that we dont need to make a call for very chunk
- $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}")
- serviceInfo = KubernetesApiClient.getKubeResourceInfo("services")
- # serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body)
- $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}")
-
- if !serviceInfo.nil?
- $log.info("in_kube_podinventory::enumerate:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}")
- serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body))
- $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}")
- serviceInfo = nil
- # service inventory records much smaller and fixed size compared to serviceList
- serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime)
- # updating for telemetry
- @serviceCount += serviceRecords.length
- serviceList = nil
- end
+ serviceInventory = {}
+ serviceItemsCacheSizeKB = 0
+ @serviceCacheMutex.synchronize {
+ serviceInventory["items"] = @serviceItemsCache.values.clone
+ if KubernetesApiClient.isEmitCacheTelemetry()
+ serviceItemsCacheSizeKB = @serviceItemsCache.to_s.length / 1024
+ end
+ }
+ serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceInventory, batchTime)
+ # updating for telemetry
+ @serviceCount = serviceRecords.length
+ $log.info("in_kube_podinventory::enumerate : number of service items :#{@serviceCount} from Kube API @ #{Time.now.utc.iso8601}")
- # to track e2e processing latency
@podsAPIE2ELatencyMs = 0
podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i
# Initializing continuation token to nil
continuationToken = nil
- $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}")
- continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}")
- $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}")
+ podItemsCacheSizeKB = 0
+ podInventory = {}
+ @podCacheMutex.synchronize {
+ podInventory["items"] = @podItemsCache.values.clone
+ if KubernetesApiClient.isEmitCacheTelemetry()
+ podItemsCacheSizeKB = @podItemsCache.to_s.length / 1024
+ end
+ }
podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i
@podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime)
if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
@@ -180,25 +207,11 @@ def enumerate(podList = nil)
else
$log.warn "in_kube_podinventory::enumerate:Received empty podInventory"
end
-
- #If we receive a continuation token, make calls, process and flush data until we have processed all data
- while (!continuationToken.nil? && !continuationToken.empty?)
- podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i
- continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}")
- podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i
- @podsAPIE2ELatencyMs = @podsAPIE2ELatencyMs + (podsAPIChunkEndTime - podsAPIChunkStartTime)
- if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
- $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
- parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime)
- else
- $log.warn "in_kube_podinventory::enumerate:Received empty podInventory"
- end
- end
-
@podInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - podInventoryStartTime)
# Setting these to nil so that we dont hold memory until GC kicks in
podInventory = nil
serviceRecords = nil
+ @mdmPodRecordItems = nil
# Adding telemetry to send pod telemetry every 5 minutes
timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs
@@ -213,6 +226,11 @@ def enumerate(podList = nil)
telemetryProperties["Computer"] = @@hostName
telemetryProperties["PODS_CHUNK_SIZE"] = @PODS_CHUNK_SIZE
telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE
+ if KubernetesApiClient.isEmitCacheTelemetry()
+ telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB
+ telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB
+ telemetryProperties["WINDOWS_CONTAINER_RECORDS_CACHE_SIZE_KB"] = @windowsContainerRecordsCacheSizeBytes / 1024
+ end
ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties)
ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {})
ApplicationInsightsUtility.sendMetricTelemetry("ContainerCount", @containerCount, {})
@@ -221,7 +239,7 @@ def enumerate(podList = nil)
ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", @controllerSet.length, telemetryProperties)
if @winContainerCount > 0
telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount
- telemetryProperties["WindowsNodeCount"] = @windowsNodeCount
+ telemetryProperties["WindowsNodeCount"] = @windowsNodeNameListCache.length
telemetryProperties["ClusterWideWindowsContainerInventoryTotalSizeKB"] = @winContainerInventoryTotalSizeBytes / 1024
telemetryProperties["WindowsContainerCountWithInventoryRecordSize64KBorMore"] = @winContainerCountWithInventoryRecordSize64KBOrMore
if @winContainerCountWithEnvVarSize64KBOrMore > 0
@@ -257,8 +275,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc
@@istestvar = ENV["ISTEST"]
begin #begin block start
- # Getting windows nodes from kubeapi
- winNodes = KubernetesApiClient.getWindowsNodesArray
podInventory["items"].each do |item| #podInventory block start
# pod inventory records
podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime)
@@ -266,40 +282,39 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc
podInventoryRecords.each do |record|
if !record.nil?
eventStream.add(emitTime, record) if record
- @inventoryToMdmConvertor.process_pod_inventory_record(record)
end
end
# Setting this flag to true so that we can send ContainerInventory records for containers
# on windows nodes and parse environment variables for these containers
- if winNodes.length > 0
- nodeName = ""
- if !item["spec"]["nodeName"].nil?
- nodeName = item["spec"]["nodeName"]
+ nodeName = ""
+ if !item["spec"]["nodeName"].nil?
+ nodeName = item["spec"]["nodeName"]
+ end
+ if (!item["isWindows"].nil? && !item["isWindows"].empty? && item["isWindows"].downcase == "true")
+ clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"]
+ #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel
+ containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true)
+ if KubernetesApiClient.isEmitCacheTelemetry()
+ @windowsContainerRecordsCacheSizeBytes += containerInventoryRecords.to_s.length
end
- @windowsNodeCount = winNodes.length
- if (!nodeName.empty? && (winNodes.include? nodeName))
- clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"]
- #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel
- containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true)
- # Send container inventory records for containers on windows nodes
- @winContainerCount += containerInventoryRecords.length
- containerInventoryRecords.each do |cirecord|
- if !cirecord.nil?
- containerInventoryStream.add(emitTime, cirecord) if cirecord
- ciRecordSize = cirecord.to_s.length
- @winContainerInventoryTotalSizeBytes += ciRecordSize
- if ciRecordSize >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY
- @winContainerCountWithInventoryRecordSize64KBOrMore += 1
- end
- if !cirecord["EnvironmentVar"].nil? && !cirecord["EnvironmentVar"].empty? && cirecord["EnvironmentVar"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY
- @winContainerCountWithEnvVarSize64KBOrMore += 1
- end
- if !cirecord["Ports"].nil? && !cirecord["Ports"].empty? && cirecord["Ports"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY
- @winContainerCountWithPortsSize64KBOrMore += 1
- end
- if !cirecord["Command"].nil? && !cirecord["Command"].empty? && cirecord["Command"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY
- @winContainerCountWithCommandSize64KBOrMore += 1
- end
+ # Send container inventory records for containers on windows nodes
+ @winContainerCount += containerInventoryRecords.length
+ containerInventoryRecords.each do |cirecord|
+ if !cirecord.nil?
+ containerInventoryStream.add(emitTime, cirecord) if cirecord
+ ciRecordSize = cirecord.to_s.length
+ @winContainerInventoryTotalSizeBytes += ciRecordSize
+ if ciRecordSize >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY
+ @winContainerCountWithInventoryRecordSize64KBOrMore += 1
+ end
+ if !cirecord["EnvironmentVar"].nil? && !cirecord["EnvironmentVar"].empty? && cirecord["EnvironmentVar"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY
+ @winContainerCountWithEnvVarSize64KBOrMore += 1
+ end
+ if !cirecord["Ports"].nil? && !cirecord["Ports"].empty? && cirecord["Ports"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY
+ @winContainerCountWithPortsSize64KBOrMore += 1
+ end
+ if !cirecord["Command"].nil? && !cirecord["Command"].empty? && cirecord["Command"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY
+ @winContainerCountWithCommandSize64KBOrMore += 1
end
end
end
@@ -313,45 +328,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc
router.emit_stream(@tag, eventStream) if eventStream
eventStream = Fluent::MultiEventStream.new
end
-
- #container perf records
- containerMetricDataItems = []
- containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime))
- containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime))
- containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime))
- containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime))
-
- containerMetricDataItems.each do |record|
- kubePerfEventStream.add(emitTime, record) if record
- end
-
- if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE
- $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}")
- router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream
- if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
- $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}")
- end
- kubePerfEventStream = Fluent::MultiEventStream.new
- end
-
- # container GPU records
- containerGPUInsightsMetricsDataItems = []
- containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime))
- containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime))
- containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime))
- containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime))
- containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord|
- insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord
- end
-
- if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE
- $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}")
- if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
- $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
- end
- router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream
- insightsMetricsEventStream = Fluent::MultiEventStream.new
- end
end #podInventory block end
if eventStream.count > 0
@@ -372,33 +348,26 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc
containerInventoryStream = nil
end
- if kubePerfEventStream.count > 0
- $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}")
- router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream
- kubePerfEventStream = nil
- if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
- $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}")
- end
- end
-
- if insightsMetricsEventStream.count > 0
- $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}")
- router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream
- if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
- $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+ if continuationToken.nil? #no more chunks in this batch to be sent, write all mdm pod inventory records to send
+ if CustomMetricsUtils.check_custom_metrics_availability
+ begin
+ if !@mdmPodRecordItems.nil? && @mdmPodRecordItems.length > 0
+ mdmPodRecords = {
+ "collectionTime": batchTime,
+ "items": @mdmPodRecordItems,
+ }
+ mdmPodRecordsJson = mdmPodRecords.to_json
+ @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}"
+ @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}"
+ writeMDMRecords(mdmPodRecordsJson)
+ mdmPodRecords = nil
+ mdmPodRecordsJson = nil
+ @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}"
+ end
+ rescue => err
+ @log.warn "in_kube_podinventory::parse_and_emit_records: failed to write MDMRecords with an error: #{err} @ #{Time.now.utc.iso8601}"
+ end
end
- insightsMetricsEventStream = nil
- end
-
- if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send
- @log.info "Sending pod inventory mdm records to out_mdm"
- pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime)
- @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}"
- mdm_pod_inventory_es = Fluent::MultiEventStream.new
- pod_inventory_mdm_records.each { |pod_inventory_mdm_record|
- mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record
- } if pod_inventory_mdm_records
- router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es
end
if continuationToken.nil? # sending kube services inventory records
@@ -477,6 +446,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601)
record = {}
begin
+ mdmPodRecord = {}
record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated
record["Name"] = item["metadata"]["name"]
podNameSpace = item["metadata"]["namespace"]
@@ -552,7 +522,14 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601)
record["PodRestartCount"] = 0
#Invoke the helper method to compute ready/not ready mdm metric
- @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"])
+ mdmPodRecord["PodUid"] = podUid
+ mdmPodRecord["Computer"] = nodeName
+ mdmPodRecord["ControllerName"] = record["ControllerName"]
+ mdmPodRecord["Namespace"] = record["Namespace"]
+ mdmPodRecord["PodStatus"] = record["PodStatus"]
+ mdmPodRecord["PodReadyCondition"] = KubernetesApiClient.getPodReadyCondition(item["status"]["conditions"])
+ mdmPodRecord["ControllerKind"] = record["ControllerKind"]
+ mdmPodRecord["containerRecords"] = []
podContainers = []
if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty?
@@ -589,6 +566,8 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601)
record["ContainerRestartCount"] = containerRestartCount
containerStatus = container["state"]
+
+ mdmContainerRecord = {}
record["ContainerStatusReason"] = ""
# state is of the following form , so just picking up the first key name
# "state": {
@@ -613,7 +592,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601)
end
# Process the record to see if job was completed 6 hours ago. If so, send metric to mdm
if !record["ControllerKind"].nil? && record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB
- @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus)
+ mdmContainerRecord["state"] = containerStatus
end
end
@@ -641,7 +620,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601)
#Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled
if lastStateReason.downcase == Constants::REASON_OOM_KILLED
- @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime)
+ mdmContainerRecord["lastState"] = container["lastState"]
end
lastStateReason = nil
else
@@ -653,7 +632,8 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601)
#Populate mdm metric for container restart count if greater than 0
if (!containerRestartCount.nil? && (containerRestartCount.is_a? Integer) && containerRestartCount > 0)
- @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime)
+ mdmContainerRecord["restartCount"] = containerRestartCount
+ mdmContainerRecord["lastState"] = container["lastState"]
end
rescue => errorStr
$log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}"
@@ -662,6 +642,10 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601)
record["ContainerLastStatus"] = Hash.new
end
+ if !mdmContainerRecord.empty?
+ mdmPodRecord["containerRecords"].push(mdmContainerRecord.dup)
+ end
+
podRestartCount += containerRestartCount
records.push(record.dup)
end
@@ -669,6 +653,8 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601)
records.push(record)
end #container status block end
+ @mdmPodRecordItems.push(mdmPodRecord.dup)
+
records.each do |record|
if !record.nil?
record["PodRestartCount"] = podRestartCount
@@ -715,5 +701,499 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords)
end
return serviceName
end
+
+ def watch_pods
+ $log.info("in_kube_podinventory::watch_pods:Start @ #{Time.now.utc.iso8601}")
+ podsResourceVersion = nil
+ # invoke getWindowsNodes to handle the scenario where windowsNodeNameCache is not populated yet on container start
+ winNodes = KubernetesApiClient.getWindowsNodesArray()
+ if winNodes.length > 0
+ @windowsNodeNameCacheMutex.synchronize {
+ @windowsNodeNameListCache = winNodes.dup
+ }
+ end
+ loop do
+ begin
+ if podsResourceVersion.nil?
+ # clear cache before filling the cache with list
+ @podCacheMutex.synchronize {
+ @podItemsCache.clear()
+ }
+ currentWindowsNodeNameList = []
+ @windowsNodeNameCacheMutex.synchronize {
+ currentWindowsNodeNameList = @windowsNodeNameListCache.dup
+ }
+ continuationToken = nil
+ resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}"
+ $log.info("in_kube_podinventory::watch_pods:Getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}")
+ continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri)
+ if responseCode.nil? || responseCode != "200"
+ $log.warn("in_kube_podinventory::watch_pods: getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}")
+ else
+ $log.info("in_kube_podinventory::watch_pods:Done getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}")
+ if (!podInventory.nil? && !podInventory.empty?)
+ podsResourceVersion = podInventory["metadata"]["resourceVersion"]
+ if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
+ $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
+ podInventory["items"].each do |item|
+ key = item["metadata"]["uid"]
+ if !key.nil? && !key.empty?
+ nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : ""
+ isWindowsPodItem = false
+ if !nodeName.empty? &&
+ !currentWindowsNodeNameList.nil? &&
+ !currentWindowsNodeNameList.empty? &&
+ currentWindowsNodeNameList.include?(nodeName)
+ isWindowsPodItem = true
+ end
+ podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem)
+ if !podItem.nil? && !podItem.empty?
+ @podCacheMutex.synchronize {
+ @podItemsCache[key] = podItem
+ }
+ else
+ $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}"
+ end
+ else
+ $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ else
+ $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory"
+ end
+ while (!continuationToken.nil? && !continuationToken.empty?)
+ resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}"
+ continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri)
+ if responseCode.nil? || responseCode != "200"
+ $log.warn("in_kube_podinventory::watch_pods: getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}")
+ podsResourceVersion = nil
+ break # break if any of the pagination calls failed, so that the full cache will be rebuilt with LIST again
+ else
+ if (!podInventory.nil? && !podInventory.empty?)
+ podsResourceVersion = podInventory["metadata"]["resourceVersion"]
+ if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
+ $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
+ podInventory["items"].each do |item|
+ key = item["metadata"]["uid"]
+ if !key.nil? && !key.empty?
+ nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : ""
+ isWindowsPodItem = false
+ if !nodeName.empty? &&
+ !currentWindowsNodeNameList.nil? &&
+ !currentWindowsNodeNameList.empty? &&
+ currentWindowsNodeNameList.include?(nodeName)
+ isWindowsPodItem = true
+ end
+ podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem)
+ if !podItem.nil? && !podItem.empty?
+ @podCacheMutex.synchronize {
+ @podItemsCache[key] = podItem
+ }
+ else
+ $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}"
+ end
+ else
+ $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ else
+ $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ end
+ end
+ if podsResourceVersion.nil? || podsResourceVersion.empty? || podsResourceVersion == "0"
+ # https://github.com/kubernetes/kubernetes/issues/74022
+ $log.warn("in_kube_podinventory::watch_pods:received podsResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}")
+ podsResourceVersion = nil # for the LIST to happen again
+ sleep(30) # do not overwhelm the api-server if api-server down
+ else
+ begin
+ $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
+ watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true)
+ if watcher.nil?
+ $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
+ else
+ watcher.each do |notice|
+ case notice["type"]
+ when "ADDED", "MODIFIED", "DELETED", "BOOKMARK"
+ item = notice["object"]
+ # extract latest resource version to use for watch reconnect
+ if !item.nil? && !item.empty? &&
+ !item["metadata"].nil? && !item["metadata"].empty? &&
+ !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty?
+ podsResourceVersion = item["metadata"]["resourceVersion"]
+ # $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
+ else
+ $log.warn("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}")
+ podsResourceVersion = nil
+ # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
+ break
+ end
+ if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED"))
+ key = item["metadata"]["uid"]
+ if !key.nil? && !key.empty?
+ currentWindowsNodeNameList = []
+ @windowsNodeNameCacheMutex.synchronize {
+ currentWindowsNodeNameList = @windowsNodeNameListCache.dup
+ }
+ isWindowsPodItem = false
+ nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : ""
+ if !nodeName.empty? &&
+ !currentWindowsNodeNameList.nil? &&
+ !currentWindowsNodeNameList.empty? &&
+ currentWindowsNodeNameList.include?(nodeName)
+ isWindowsPodItem = true
+ end
+ podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem)
+ if !podItem.nil? && !podItem.empty?
+ @podCacheMutex.synchronize {
+ @podItemsCache[key] = podItem
+ }
+ else
+ $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}"
+ end
+ else
+ $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ elsif notice["type"] == "DELETED"
+ key = item["metadata"]["uid"]
+ if !key.nil? && !key.empty?
+ @podCacheMutex.synchronize {
+ @podItemsCache.delete(key)
+ }
+ end
+ end
+ when "ERROR"
+ podsResourceVersion = nil
+ $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}")
+ break
+ else
+ $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}")
+ # enforce LIST again otherwise cause inconsistency by skipping a potential RV with valid data!
+ podsResourceVersion = nil
+ break
+ end
+ end
+ $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods @ #{Time.now.utc.iso8601}")
+ end
+ rescue Net::ReadTimeout => errorStr
+ ## This is expected if there is no activity on the cluster for more than the readtimeout value used in the connection
+ # $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ rescue => errorStr
+ $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ podsResourceVersion = nil
+ sleep(5) # do not overwhelm the api-server if api-server down
+ ensure
+ watcher.finish if watcher
+ end
+ end
+ rescue => errorStr
+ $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ podsResourceVersion = nil
+ end
+ end
+ $log.info("in_kube_podinventory::watch_pods:End @ #{Time.now.utc.iso8601}")
+ end
+
+ def watch_services
+ $log.info("in_kube_podinventory::watch_services:Start @ #{Time.now.utc.iso8601}")
+ servicesResourceVersion = nil
+ loop do
+ begin
+ if servicesResourceVersion.nil?
+ # clear cache before filling the cache with list
+ @serviceCacheMutex.synchronize {
+ @serviceItemsCache.clear()
+ }
+ $log.info("in_kube_podinventory::watch_services:Getting services from Kube API @ #{Time.now.utc.iso8601}")
+ responseCode, serviceInfo = KubernetesApiClient.getKubeResourceInfoV2("services")
+ if responseCode.nil? || responseCode != "200"
+ $log.info("in_kube_podinventory::watch_services:Getting services from Kube API failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}")
+ else
+ $log.info("in_kube_podinventory::watch_services: Done getting services from Kube API @ #{Time.now.utc.iso8601}")
+ if !serviceInfo.nil?
+ $log.info("in_kube_podinventory::watch_services:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}")
+ serviceInventory = Yajl::Parser.parse(StringIO.new(serviceInfo.body))
+ $log.info("in_kube_podinventory::watch_services:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}")
+ serviceInfo = nil
+ if (!serviceInventory.nil? && !serviceInventory.empty?)
+ servicesResourceVersion = serviceInventory["metadata"]["resourceVersion"]
+ if (serviceInventory.key?("items") && !serviceInventory["items"].nil? && !serviceInventory["items"].empty?)
+ $log.info("in_kube_podinventory::watch_services:number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}")
+ serviceInventory["items"].each do |item|
+ key = item["metadata"]["uid"]
+ if !key.nil? && !key.empty?
+ serviceItem = KubernetesApiClient.getOptimizedItem("services", item)
+ if !serviceItem.nil? && !serviceItem.empty?
+ @serviceCacheMutex.synchronize {
+ @serviceItemsCache[key] = serviceItem
+ }
+ else
+ $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ else
+ $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ else
+ $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory @ #{Time.now.utc.iso8601}"
+ end
+ serviceInventory = nil
+ end
+ end
+ end
+ if servicesResourceVersion.nil? || servicesResourceVersion == "" || servicesResourceVersion == "0"
+ # https://github.com/kubernetes/kubernetes/issues/74022
+ $log.warn("in_kube_podinventory::watch_services:received servicesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}")
+ servicesResourceVersion = nil # for the LIST to happen again
+ sleep(30) # do not overwhelm the api-server if api-server down
+ else
+ begin
+ $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}")
+ watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true)
+ if watcher.nil?
+ $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}")
+ else
+ watcher.each do |notice|
+ case notice["type"]
+ when "ADDED", "MODIFIED", "DELETED", "BOOKMARK"
+ item = notice["object"]
+ # extract latest resource version to use for watch reconnect
+ if !item.nil? && !item.empty? &&
+ !item["metadata"].nil? && !item["metadata"].empty? &&
+ !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty?
+ servicesResourceVersion = item["metadata"]["resourceVersion"]
+ # $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}")
+ else
+ $log.warn("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}")
+ servicesResourceVersion = nil
+ # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
+ break
+ end
+ if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED"))
+ key = item["metadata"]["uid"]
+ if !key.nil? && !key.empty?
+ serviceItem = KubernetesApiClient.getOptimizedItem("services", item)
+ if !serviceItem.nil? && !serviceItem.empty?
+ @serviceCacheMutex.synchronize {
+ @serviceItemsCache[key] = serviceItem
+ }
+ else
+ $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ else
+ $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ elsif notice["type"] == "DELETED"
+ key = item["metadata"]["uid"]
+ if !key.nil? && !key.empty?
+ @serviceCacheMutex.synchronize {
+ @serviceItemsCache.delete(key)
+ }
+ end
+ end
+ when "ERROR"
+ servicesResourceVersion = nil
+ $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}")
+ break
+ else
+ servicesResourceVersion = nil
+ $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}")
+ break
+ end
+ end
+ end
+ rescue Net::ReadTimeout => errorStr
+ # $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ rescue => errorStr
+ $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ servicesResourceVersion = nil
+ sleep(5) # do not overwhelm the api-server if api-server down
+ ensure
+ watcher.finish if watcher
+ end
+ end
+ rescue => errorStr
+ $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ servicesResourceVersion = nil
+ end
+ end
+ $log.info("in_kube_podinventory::watch_services:End @ #{Time.now.utc.iso8601}")
+ end
+
+ def watch_windows_nodes
+ $log.info("in_kube_podinventory::watch_windows_nodes:Start @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil
+ loop do
+ begin
+ if nodesResourceVersion.nil?
+ @windowsNodeNameCacheMutex.synchronize {
+ @windowsNodeNameListCache.clear()
+ }
+ continuationToken = nil
+ resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows&limit=#{@NODES_CHUNK_SIZE}")
+ $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}")
+ continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri)
+ if responseCode.nil? || responseCode != "200"
+ $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}")
+ else
+ $log.info("in_kube_podinventory::watch_windows_nodes:Done getting windows nodes from Kube API @ #{Time.now.utc.iso8601}")
+ if (!nodeInventory.nil? && !nodeInventory.empty?)
+ nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"]
+ if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?)
+ $log.info("in_kube_podinventory::watch_windows_nodes: number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
+ nodeInventory["items"].each do |item|
+ key = item["metadata"]["name"]
+ if !key.nil? && !key.empty?
+ @windowsNodeNameCacheMutex.synchronize {
+ if !@windowsNodeNameListCache.include?(key)
+ @windowsNodeNameListCache.push(key)
+ end
+ }
+ else
+ $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ else
+ $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}"
+ end
+ while (!continuationToken.nil? && !continuationToken.empty?)
+ continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}")
+ if responseCode.nil? || responseCode != "200"
+ $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil
+ break # break if any of the pagination calls failed, so that the full cache can be rebuilt with LIST again
+ else
+ if (!nodeInventory.nil? && !nodeInventory.empty?)
+ nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"]
+ if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?)
+ $log.info("in_kube_podinventory::watch_windows_nodes : number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
+ nodeInventory["items"].each do |item|
+ key = item["metadata"]["name"]
+ if !key.nil? && !key.empty?
+ @windowsNodeNameCacheMutex.synchronize {
+ if !@windowsNodeNameListCache.include?(key)
+ @windowsNodeNameListCache.push(key)
+ end
+ }
+ else
+ $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ else
+ $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}"
+ end
+ end
+ end
+ end
+ end
+ if nodesResourceVersion.nil? || nodesResourceVersion.empty? || nodesResourceVersion == "0"
+ # https://github.com/kubernetes/kubernetes/issues/74022
+ $log.warn("in_kube_podinventory::watch_windows_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil # for the LIST to happen again
+ sleep(30) # do not overwhelm the api-server if api-server down
+ else
+ begin
+ $log.info("in_kube_podinventory::watch_windows_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
+ watcher = KubernetesApiClient.watch("nodes", label_selector: "kubernetes.io/os=windows", resource_version: nodesResourceVersion, allow_watch_bookmarks: true)
+ if watcher.nil?
+ $log.warn("in_kube_podinventory::watch_windows_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
+ else
+ watcher.each do |notice|
+ case notice["type"]
+ when "ADDED", "MODIFIED", "DELETED", "BOOKMARK"
+ item = notice["object"]
+ # extract latest resource version to use for watch reconnect
+ if !item.nil? && !item.empty? &&
+ !item["metadata"].nil? && !item["metadata"].empty? &&
+ !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty?
+ nodesResourceVersion = item["metadata"]["resourceVersion"]
+ # $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
+ else
+ $log.warn("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil
+ # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
+ break
+ end
+ if notice["type"] == "ADDED" # we don't need to worry about the MODIFIED event since we only need the node name
+ key = item["metadata"]["name"]
+ @windowsNodeNameCacheMutex.synchronize {
+ if !@windowsNodeNameListCache.include?(key)
+ @windowsNodeNameListCache.push(key)
+ end
+ }
+ elsif notice["type"] == "DELETED"
+ key = item["metadata"]["name"]
+ @windowsNodeNameCacheMutex.synchronize {
+ @windowsNodeNameListCache.delete(key)
+ }
+ end
+ when "ERROR"
+ nodesResourceVersion = nil
+ $log.warn("in_kube_podinventory::watch_windows_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}")
+ break
+ else
+ $log.warn("in_kube_podinventory::watch_windows_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}")
+ end
+ end
+ end
+ rescue Net::ReadTimeout => errorStr
+ ## This is expected if there is no activity for more than the readtimeout value used in the connection
+ # $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ rescue => errorStr
+ $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil
+ sleep(5) # do not overwhelm the api-server if api-server down
+ ensure
+ watcher.finish if watcher
+ end
+ end
+ rescue => errorStr
+ $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
+ nodesResourceVersion = nil
+ end
+ end
+ $log.info("in_kube_podinventory::watch_windows_nodes:End @ #{Time.now.utc.iso8601}")
+ end
+
+ def writeMDMRecords(mdmRecordsJson)
+ maxRetryCount = 5
+ initialRetryDelaySecs = 0.5
+ retryAttemptCount = 1
+ begin
+ f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "w")
+ if !f.nil?
+ isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB)
+ raise "in_kube_podinventory:writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock
+ startTime = (Time.now.to_f * 1000).to_i
+ f.write(mdmRecordsJson)
+ f.flush
+ timetakenMs = ((Time.now.to_f * 1000).to_i - startTime)
+ $log.info "in_kube_podinventory:writeMDMRecords:Successfull and with time taken(ms): #{timetakenMs}"
+ else
+ raise "in_kube_podinventory:writeMDMRecords:Failed to open file for write"
+ end
+ rescue => err
+ if retryAttemptCount <= maxRetryCount
+ f.flock(File::LOCK_UN) if !f.nil?
+ f.close if !f.nil?
+ sleep (initialRetryDelaySecs * retryAttemptCount)
+ retryAttemptCount = retryAttemptCount + 1
+ retry
+ end
+ $log.warn "in_kube_podinventory:writeMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(err)
+ ensure
+ f.flock(File::LOCK_UN) if !f.nil?
+ f.close if !f.nil?
+ end
+ end
end # Kube_Pod_Input
end # module
diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb
new file mode 100644
index 000000000..bfc5227f3
--- /dev/null
+++ b/source/plugins/ruby/in_kube_podmdminventory.rb
@@ -0,0 +1,217 @@
+#!/usr/local/bin/ruby
+# frozen_string_literal: true
+
+require "fluent/plugin/input"
+
+module Fluent::Plugin
+ require_relative "podinventory_to_mdm"
+
+ class Kube_PodMDMInventory_Input < Input
+ Fluent::Plugin.register_input("kube_podmdminventory", self)
+
+ @@MDMKubePodInventoryTag = "mdm.kubepodinventory"
+
+ def initialize
+ super
+ require "yaml"
+ require "yajl/json_gem"
+ require "yajl"
+ require "set"
+ require "time"
+ require "net/http"
+ require "fileutils"
+ require_relative "ApplicationInsightsUtility"
+ require_relative "oms_common"
+ require_relative "omslog"
+ require_relative "constants"
+ require_relative "CustomMetricsUtils"
+ end
+
+ config_param :run_interval, :time, :default => 60
+
+ def configure(conf)
+ super
+ @inventoryToMdmConvertor = Inventory2MdmConvertor.new()
+ end
+
+ def start
+ if @run_interval
+ super
+ $log.info("in_kube_podmdminventory::start @ #{Time.now.utc.iso8601}")
+ @isCustomMetricsAvailability = CustomMetricsUtils.check_custom_metrics_availability
+ @finished = false
+ @prevCollectionTime = nil
+ @condition = ConditionVariable.new
+ @mutex = Mutex.new
+ @thread = Thread.new(&method(:run_periodic))
+ end
+ end
+
+ def shutdown
+ if @run_interval
+ @mutex.synchronize {
+ @finished = true
+ @condition.signal
+ }
+ @thread.join
+ super # This super must be at the end of shutdown method
+ end
+ end
+
+ def enumerate
+ begin
+ if !@isCustomMetricsAvailability
+ $log.warn "in_kube_podmdminventory::enumerate:skipping since custom metrics not available either for this cluster type or the region"
+ else
+ parse_and_emit_records()
+ end
+ rescue => errorStr
+ $log.warn "in_kube_podmdminventory::enumerate:Failed in enumerate: #{errorStr}"
+ $log.debug_backtrace(errorStr.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ end
+
+ def parse_and_emit_records()
+ begin
+ $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:getMDMRecords @ #{Time.now.utc.iso8601}"
+ mdmPodRecords = getMDMRecords()
+ $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}"
+ if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmPodRecords["items"].length > 0
+ batchTime = mdmPodRecords["collectionTime"] # This is same batchTime used in KubePODinventory
+ mdmPodRecords["items"].each do |record|
+ @inventoryToMdmConvertor.process_pod_inventory_record(record)
+ @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"])
+ containerRecords = record["containerRecords"]
+ if !containerRecords.nil? && !containerRecords.empty? && containerRecords.length > 0
+ containerRecords.each do |containerRecord|
+ if !containerRecord["state"].nil? && !containerRecord["state"].empty?
+ @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"])
+ end
+ begin
+ if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1
+ lastStateName = containerRecord["lastState"].keys[0]
+ lastStateObject = containerRecord["lastState"][lastStateName]
+ if !lastStateObject.is_a?(Hash)
+ raise "expected a hash object. This could signify a bug or a kubernetes API change"
+ end
+ if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt")
+ lastStateReason = lastStateObject["reason"]
+ lastFinishedTime = lastStateObject["finishedAt"]
+ #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled
+ if lastStateReason.downcase == Constants::REASON_OOM_KILLED
+ @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime)
+ end
+ lastStateReason = nil
+ end
+ end
+ containerRestartCount = containerRecord["restartCount"]
+ #Populate mdm metric for container restart count if greater than 0
+ if (!containerRestartCount.nil? && (containerRestartCount.is_a? Integer) && containerRestartCount > 0)
+ @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime)
+ end
+ rescue => err
+ $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}"
+ $log.debug_backtrace(err.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(err)
+ end
+ end
+ end
+ end
+ @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm @ #{Time.now.utc.iso8601}"
+ pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime)
+ @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size} @ #{Time.now.utc.iso8601}"
+ if !pod_inventory_mdm_records.nil? && pod_inventory_mdm_records.length > 0
+ startTime = (Time.now.to_f * 1000).to_i
+ recordCount = pod_inventory_mdm_records.length
+ while recordCount > 0
+ record_array = pod_inventory_mdm_records.take(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE)
+ time_array = Array.new(record_array.length) { batchTime }
+ mdm_pod_inventory_es = Fluent::MultiEventStream.new(time_array, record_array)
+ router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es)
+ pod_inventory_mdm_records = pod_inventory_mdm_records.drop(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE)
+ recordCount = pod_inventory_mdm_records.length
+ time_array = nil
+ end
+ flushTimeMs = (Time.now.to_f * 1000).to_i - startTime
+ @log.info "in_kube_podmdminventory:parse_and_emit_records:timetaken to flush all Pod MDM records: #{flushTimeMs} @ #{Time.now.utc.iso8601}"
+ end
+ end
+ rescue => errorStr
+ $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}"
+ $log.debug_backtrace(errorStr.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ end
+
+ def run_periodic
+ @mutex.lock
+ done = @finished
+ @nextTimeToRun = Time.now
+ @waitTimeout = @run_interval
+ until done
+ @nextTimeToRun = @nextTimeToRun + @run_interval
+ @now = Time.now
+ if @nextTimeToRun <= @now
+ @waitTimeout = 1
+ @nextTimeToRun = @now
+ else
+ @waitTimeout = @nextTimeToRun - @now
+ end
+ @condition.wait(@mutex, @waitTimeout)
+ done = @finished
+ @mutex.unlock
+ if !done
+ begin
+ $log.info("in_kube_podmdminventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}")
+ enumerate
+ $log.info("in_kube_podmdminventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}")
+ rescue => errorStr
+ $log.warn "in_kube_podmdminventory::run_periodic: enumerate Failed to retrieve pod inventory: #{errorStr}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ end
+ @mutex.lock
+ end
+ @mutex.unlock
+ end
+
+ def getMDMRecords()
+ maxRetryCount = 5
+ initialRetryDelaySecs = 0.5
+ retryAttemptCount = 1
+ mdmRecords = {}
+ begin
+ f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r")
+ if !f.nil?
+ isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB)
+ raise "in_kube_podmdminventory:getMDMRecords:Failed to acquire file lock" if !isAcquiredLock
+ startTime = (Time.now.to_f * 1000).to_i
+ mdmRecords = Yajl::Parser.parse(f)
+ timetakenMs = ((Time.now.to_f * 1000).to_i - startTime)
+ if mdmRecords.nil? || mdmRecords.empty? || mdmRecords["items"].nil? || mdmRecords["collectionTime"] == @prevCollectionTime
+ raise "in_kube_podmdminventory:getMDMRecords: either read mdmRecords is nil or empty or stale"
+ end
+ @prevCollectionTime = mdmRecords["collectionTime"]
+ $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords["items"].length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}"
+ else
+ raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read"
+ end
+ rescue => err
+ if retryAttemptCount <= maxRetryCount
+ f.flock(File::LOCK_UN) if !f.nil?
+ f.close if !f.nil?
+ sleep (initialRetryDelaySecs * retryAttemptCount)
+ retryAttemptCount = retryAttemptCount + 1
+ retry
+ end
+ $log.warn "in_kube_podmdminventory:getMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}"
+ ApplicationInsightsUtility.sendExceptionTelemetry(err)
+ ensure
+ f.flock(File::LOCK_UN) if !f.nil?
+ f.close if !f.nil?
+ end
+ return mdmRecords
+ end
+  end # Kube_PodMDMInventory_Input
+end # module
diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb
index 82e36c8cc..81889b61b 100644
--- a/source/plugins/ruby/kubernetes_container_inventory.rb
+++ b/source/plugins/ruby/kubernetes_container_inventory.rb
@@ -50,7 +50,7 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa
if !atLocation.nil?
containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1]
end
- end
+ end
containerInventoryRecord["ExitCode"] = 0
isContainerTerminated = false
isContainerWaiting = false
@@ -84,19 +84,19 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa
end
containerInfoMap = containersInfoMap[containerName]
- # image can be in any one of below format in spec
- # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image
+ # image can be in any one of below format in spec
+ # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image
imageValue = containerInfoMap["image"]
if !imageValue.nil? && !imageValue.empty?
# Find delimiters in image format
atLocation = imageValue.index("@")
- isDigestSpecified = false
+ isDigestSpecified = false
if !atLocation.nil?
# repository/image@digest or repository/image:imagetag@digest, image@digest
imageValue = imageValue[0..(atLocation - 1)]
# Use Digest from the spec's image in case when the status doesnt get populated i.e. container in pending or image pull back etc.
if containerInventoryRecord["ImageId"].nil? || containerInventoryRecord["ImageId"].empty?
- containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1]
+ containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1]
end
isDigestSpecified = true
end
@@ -105,14 +105,14 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa
if !colonLocation.nil?
if slashLocation.nil?
# image:imagetag
- containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)]
+ containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)]
else
# repository/image:imagetag
containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)]
containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)]
end
containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1]
- else
+ else
if slashLocation.nil?
# image
containerInventoryRecord["Image"] = imageValue
@@ -120,15 +120,15 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa
# repo/image
containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)]
containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..-1]
- end
+ end
# if no tag specified, k8s assumes latest as imagetag and this is same behavior from docker API and from status.
# Ref - https://kubernetes.io/docs/concepts/containers/images/#image-names
- if isDigestSpecified == false
+ if isDigestSpecified == false
containerInventoryRecord["ImageTag"] = "latest"
end
- end
+ end
end
-
+
podName = containerInfoMap["PodName"]
namespace = containerInfoMap["Namespace"]
# containername in the format what docker sees
@@ -199,7 +199,12 @@ def getContainersInfoMap(podItem, isWindows)
cmdValue = container["command"]
cmdValueString = (cmdValue.nil?) ? "" : cmdValue.to_s
containerInfoMap["Command"] = cmdValueString
- containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container)
+ if isWindows
+ # For windows container inventory, we dont need to get envvars from pods response since its already taken care in KPI as part of pod optimized item
+ containerInfoMap["EnvironmentVar"] = container["env"]
+ else
+ containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container)
+ end
containersInfoMap[containerName] = containerInfoMap
end
end
@@ -212,47 +217,47 @@ def getContainersInfoMap(podItem, isWindows)
return containersInfoMap
end
- def obtainContainerEnvironmentVars(containerId)
+ def obtainContainerEnvironmentVars(containerId)
envValueString = ""
begin
- isCGroupPidFetchRequired = false
+ isCGroupPidFetchRequired = false
if !@@containerCGroupCache.has_key?(containerId)
- isCGroupPidFetchRequired = true
+ isCGroupPidFetchRequired = true
else
cGroupPid = @@containerCGroupCache[containerId]
- if cGroupPid.nil? || cGroupPid.empty?
+ if cGroupPid.nil? || cGroupPid.empty?
isCGroupPidFetchRequired = true
@@containerCGroupCache.delete(containerId)
- elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ")
+ elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ")
isCGroupPidFetchRequired = true
- @@containerCGroupCache.delete(containerId)
- end
+ @@containerCGroupCache.delete(containerId)
+ end
end
- if isCGroupPidFetchRequired
+ if isCGroupPidFetchRequired
Dir["/hostfs/proc/*/cgroup"].each do |filename|
begin
if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any?
# file full path is /hostfs/proc//cgroup
- cGroupPid = filename.split("/")[3]
- if is_number?(cGroupPid)
+ cGroupPid = filename.split("/")[3]
+ if is_number?(cGroupPid)
if @@containerCGroupCache.has_key?(containerId)
- tempCGroupPid = @@containerCGroupCache[containerId]
+ tempCGroupPid = @@containerCGroupCache[containerId]
if tempCGroupPid.to_i > cGroupPid.to_i
@@containerCGroupCache[containerId] = cGroupPid
end
else
@@containerCGroupCache[containerId] = cGroupPid
- end
+ end
end
end
- rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read
- end
- end
+ rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read
+ end
+ end
end
cGroupPid = @@containerCGroupCache[containerId]
if !cGroupPid.nil? && !cGroupPid.empty?
- environFilePath = "/hostfs/proc/#{cGroupPid}/environ"
+ environFilePath = "/hostfs/proc/#{cGroupPid}/environ"
if File.exist?(environFilePath)
# Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE
# Check to see if the environment variable collection is disabled for this container.
@@ -265,7 +270,7 @@ def obtainContainerEnvironmentVars(containerId)
if !envVars.nil? && !envVars.empty?
envVars = envVars.split("\0")
envValueString = envVars.to_json
- envValueStringLength = envValueString.length
+ envValueStringLength = envValueString.length
if envValueStringLength >= 200000
lastIndex = envValueString.rindex("\",")
if !lastIndex.nil?
@@ -376,6 +381,7 @@ def deleteCGroupCacheEntryForDeletedContainer(containerId)
ApplicationInsightsUtility.sendExceptionTelemetry(error)
end
end
+
def is_number?(value)
true if Integer(value) rescue false
end
diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb
index e882f5ec7..4561cdd9a 100644
--- a/source/plugins/ruby/out_mdm.rb
+++ b/source/plugins/ruby/out_mdm.rb
@@ -12,6 +12,7 @@ def initialize
super
require "net/http"
require "net/https"
+ require "securerandom"
require "uri"
require "yajl/json_gem"
require_relative "KubernetesApiClient"
@@ -43,7 +44,6 @@ def initialize
@data_hash = {}
@parsed_token_uri = nil
- @http_client = nil
@token_expiry_time = Time.now
@cached_access_token = String.new
@last_post_attempt_time = Time.now
@@ -63,6 +63,7 @@ def initialize
@mdm_exceptions_hash = {}
@mdm_exceptions_count = 0
@mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i
+ @proxy = nil
end
def configure(conf)
@@ -110,15 +111,7 @@ def start
end
@@post_request_url = @@post_request_url_template % { metrics_endpoint: metrics_endpoint, aks_resource_id: aks_resource_id }
@post_request_uri = URI.parse(@@post_request_url)
- proxy = (ProxyUtils.getProxyConfiguration)
- if proxy.nil? || proxy.empty?
- @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port)
- else
- @log.info "Proxy configured on this cluster: #{aks_resource_id}"
- @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port, proxy[:addr], proxy[:port], proxy[:user], proxy[:pass])
- end
-
- @http_client.use_ssl = true
+ @proxy = (ProxyUtils.getProxyConfiguration)
@log.info "POST Request url: #{@@post_request_url}"
ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPluginStart", {})
@@ -165,6 +158,10 @@ def start
end
end
+ def multi_workers_ready?
+ return true
+ end
+
# get the access token only if the time to expiry is less than 5 minutes and get_access_token_backoff has expired
def get_access_token
if (Time.now > @get_access_token_backoff_expiry)
@@ -356,47 +353,56 @@ def send_to_mdm(post_body)
else
access_token = get_access_token
end
+ if @proxy.nil? || @proxy.empty?
+ http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port)
+ else
+        @log.info "Proxy configured on this cluster: #{ENV['AKS_RESOURCE_ID']}"
+ http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port, @proxy[:addr], @proxy[:port], @proxy[:user], @proxy[:pass])
+ end
+ http_client.use_ssl = true
+ requestId = SecureRandom.uuid.to_s
request = Net::HTTP::Post.new(@post_request_uri.request_uri)
request["Content-Type"] = "application/x-ndjson"
request["Authorization"] = "Bearer #{access_token}"
+ request["x-request-id"] = requestId
request.body = post_body.join("\n")
- @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024}"
- response = @http_client.request(request)
+ @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024} for requestId: #{requestId}"
+ response = http_client.request(request)
response.value # this throws for non 200 HTTP response code
- @log.info "HTTP Post Response Code : #{response.code}"
+ @log.info "HTTP Post Response Code : #{response.code} for requestId: #{requestId}"
if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now
ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {})
@last_telemetry_sent_time = Time.now
end
rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException
if !response.nil? && !response.body.nil? #body will have actual error
- @log.info "Failed to Post Metrics to MDM : #{e} Response.body: #{response.body}"
+ @log.info "Failed to Post Metrics to MDM for requestId: #{requestId} exception: #{e} Response.body: #{response.body}"
else
- @log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}"
+ @log.info "Failed to Post Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}"
end
@log.debug_backtrace(e.backtrace)
if !response.code.empty? && response.code == 403.to_s
- @log.info "Response Code #{response.code} Updating @last_post_attempt_time"
+ @log.info "Response Code #{response.code} for requestId: #{requestId} Updating @last_post_attempt_time"
@last_post_attempt_time = Time.now
@first_post_attempt_made = true
# Not raising exception, as that will cause retries to happen
elsif !response.code.empty? && response.code.start_with?("4")
# Log 400 errors and continue
- @log.info "Non-retryable HTTPClientException when POSTing Metrics to MDM #{e} Response: #{response}"
+ @log.info "Non-retryable HTTPClientException when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}"
else
# raise if the response code is non-400
- @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}"
+ @log.info "HTTPServerException when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}"
raise e
end
# Adding exceptions to hash to aggregate and send telemetry for all 400 error codes
exception_aggregator(e)
rescue Errno::ETIMEDOUT => e
- @log.info "Timed out when POSTing Metrics to MDM : #{e} Response: #{response}"
+ @log.info "Timed out when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}"
@log.debug_backtrace(e.backtrace)
raise e
rescue Exception => e
- @log.info "Exception POSTing Metrics to MDM : #{e} Response: #{response}"
+ @log.info "Exception POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}"
@log.debug_backtrace(e.backtrace)
raise e
end
diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb
index c24a91a87..a7f9c5435 100644
--- a/source/plugins/ruby/podinventory_to_mdm.rb
+++ b/source/plugins/ruby/podinventory_to_mdm.rb
@@ -129,7 +129,7 @@ def get_pod_inventory_mdm_records(batch_time)
controllerNameDimValue: podControllerNameDimValue,
podCountMetricValue: value,
}
- records.push(JSON.parse(record))
+ records.push(Yajl::Parser.parse(record))
}
#Add pod metric records
@@ -218,24 +218,13 @@ def process_record_for_container_restarts_metric(podControllerNameDimValue, podN
end
end
- def process_record_for_pods_ready_metric(podControllerNameDimValue, podNamespaceDimValue, podStatusConditions)
+ def process_record_for_pods_ready_metric(podControllerNameDimValue, podNamespaceDimValue, podReadyCondition)
if @process_incoming_stream
begin
@log.info "in process_record_for_pods_ready_metric..."
if podControllerNameDimValue.nil? || podControllerNameDimValue.empty?
podControllerNameDimValue = "No Controller"
end
- podReadyCondition = false
- if !podStatusConditions.nil? && !podStatusConditions.empty?
- podStatusConditions.each do |condition|
- if condition["type"] == "Ready"
- if condition["status"].downcase == "true"
- podReadyCondition = true
- end
- break #Exit the for loop since we found the ready condition
- end
- end
- end
MdmMetricsGenerator.generatePodReadyMetrics(podControllerNameDimValue,
podNamespaceDimValue, podReadyCondition)
rescue => errorStr