diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 53040e2f9..5b3837748 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -1,99 +1,80 @@ - #Kubernetes pod inventory - - @type kube_podinventory - tag oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB - run_interval 60 - @log_level debug - - - #Kubernetes Persistent Volume inventory - - @type kube_pvinventory - tag oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB - run_interval 60 - @log_level debug - - - #Kubernetes events - - @type kube_events - tag oneagent.containerInsights.KUBE_EVENTS_BLOB - run_interval 60 - @log_level debug - - - #Kubernetes Nodes - - @type kube_nodes - tag oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB - run_interval 60 - @log_level debug - - - #cadvisor perf- Windows nodes - - @type win_cadvisor_perf - tag oneagent.containerInsights.LINUX_PERF_BLOB - run_interval 60 - @log_level debug - - - #Kubernetes object state - deployments - - @type kubestate_deployments - tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB - run_interval 60 - @log_level debug - + #fluent forward plugin + + workers "#{ENV['NUM_OF_FLUENTD_WORKERS']}" + root_dir /var/opt/microsoft/docker-cimprov/state + - #Kubernetes object state - HPA - - @type kubestate_hpa - tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB - run_interval 60 - @log_level debug - + #perf + + @type forward + @id out_perf_fwd + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length "#{ENV['FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + - - @type inventory2mdm - @log_level info - - - #custom_metrics_mdm filter plugin for perf data 
from windows nodes + #custom_metrics_mdm filter plugin for perf data from windows nodes @type cadvisor2mdm metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes @log_level info - #kubepodinventory - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - + #containerinventory for windows containers + + @type forward + @id out_ci_fwd + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + @type file - path /var/opt/microsoft/docker-cimprov/state/kubepod*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - keepalive true + keepalive true - #kubepvinventory - + + + #Kubernetes pod inventory + + @type kube_podinventory + tag oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB + run_interval 60 + @log_level debug + + + #kubepodinventory + @type forward @log_level debug send_timeout 30 @@ -105,22 +86,21 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubepod*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - keepalive true - + keepalive true + - #InsightsMetrics - #kubestate - + #kubeservices + @type forward @log_level debug send_timeout 30 @@ -132,21 +112,30 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer + path 
/var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 2 - keepalive true + keepalive true + + + #Kubernetes Nodes + + @type kube_nodes + tag oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB + run_interval 60 + @log_level debug + - #kubeevents - + #containernodeinventory + @type forward @log_level debug send_timeout 30 @@ -158,21 +147,26 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer + path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 3 keepalive true - - #kubeservices - + + + @type inventory2mdm + @log_level info + + + #kubenodeinventory + @type forward @log_level debug send_timeout 30 @@ -184,47 +178,49 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 2 + flush_thread_count 5 - keepalive true - + keepalive true + - #kubenodeinventory - - @type forward + + @type mdm + @id out_mdm_nodeinventory @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port 
"#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - @type file - path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer + path /var/opt/microsoft/docker-cimprov/state/out_mdm_nodeinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - keepalive true + retry_mdm_post_wait_minutes 30 + + + #Kubernetes events + + @type kube_events + tag oneagent.containerInsights.KUBE_EVENTS_BLOB + run_interval 60 + @log_level debug + - #containernodeinventory - + #kubeevents + @type forward @log_level debug send_timeout 30 @@ -236,47 +232,90 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 3 + flush_thread_count 5 - keepalive true + keepalive true + + + #Kubernetes podmdm inventory + + @type kube_podmdminventory + run_interval 60 + @log_level debug + - #containerinventory for windows containers - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - + + @type mdm + @id out_mdm_podinventory + @log_level debug @type file - path /var/opt/microsoft/docker-cimprov/state/containerinventory*.buffer + path /var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + 
flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count "#{ENV['FLUENTD_MDM_FLUSH_THREAD_COUNT']}" - keepalive true - + retry_mdm_post_wait_minutes 30 + + + + + #Kubernetes perf inventory + + @type kube_perfinventory + tag oneagent.containerInsights.LINUX_PERF_BLOB + run_interval 60 + @log_level debug + + + #Kubernetes Persistent Volume inventory + + @type kube_pvinventory + tag oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB + run_interval 60 + @log_level debug + - #perf - + #cadvisor perf- Windows nodes + + @type win_cadvisor_perf + tag oneagent.containerInsights.LINUX_PERF_BLOB + run_interval 60 + @log_level debug + + + #Kubernetes object state - deployments + + @type kubestate_deployments + tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB + run_interval 60 + @log_level debug + + + #Kubernetes object state - HPA + + @type kubestate_hpa + tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB + run_interval 60 + @log_level debug + + + #kubepvinventory + @type forward @log_level debug send_timeout 30 @@ -288,51 +327,62 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/perf*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - keepalive true + keepalive true - - @type mdm - @log_level debug + #InsightsMetrics + #kubestate + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + @type file - path /var/opt/microsoft/docker-cimprov/state/out_mdm_*.buffer + path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer 
overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - retry_mdm_post_wait_minutes 30 + keepalive true @type mdm + @id out_mdm_perf @log_level debug @type file path /var/opt/microsoft/docker-cimprov/state/out_mdm_cdvisorperf*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 retry_mdm_post_wait_minutes 30 + diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 7dcbde31f..92b494ae3 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -132,6 +132,8 @@ MAINTAINER: 'Microsoft Corporation' /etc/fluent/plugin/in_containerinventory.rb; source/plugins/ruby/in_containerinventory.rb; 644; root; root /etc/fluent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root /etc/fluent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root +/etc/fluent/plugin/in_kube_podmdminventory.rb; source/plugins/ruby/in_kube_podmdminventory.rb; 644; root; root +/etc/fluent/plugin/in_kube_perfinventory.rb; source/plugins/ruby/in_kube_perfinventory.rb; 644; root; root /etc/fluent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root /etc/fluent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root /etc/fluent/plugin/in_kube_pvinventory.rb; source/plugins/ruby/in_kube_pvinventory.rb; 644; root; root @@ -143,6 
+145,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/fluent/plugin/filter_telegraf2mdm.rb; source/plugins/ruby/filter_telegraf2mdm.rb; 644; root; root /etc/fluent/plugin/out_mdm.rb; source/plugins/ruby/out_mdm.rb; 644; root; root +/etc/fluent/plugin/WatchStream.rb; source/plugins/ruby/WatchStream.rb; 644; root; root diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index b5b239af0..ad7452aa5 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -53,6 +53,11 @@ spec: resources: {{ toYaml .Values.omsagent.resources.deployment | indent 9 }} env: + - name: NUM_OF_FLUENTD_WORKERS + valueFrom: + resourceFieldRef: + containerName: omsagent + resource: limits.cpu {{- if ne .Values.omsagent.env.clusterId "" }} - name: AKS_RESOURCE_ID value: {{ .Values.omsagent.env.clusterId | quote }} diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 3e25fc3a4..1e00457d9 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -23,8 +23,7 @@ waitforlisteneronTCPport() { if [[ $port =~ $numeric ]] && [[ $waittimesecs =~ $numeric ]]; then #local varlistener=$(netstat -lnt | awk '$6 == "LISTEN" && $4 ~ ":25228$"') - while true - do + while true; do if [ $totalsleptsecs -gt $waittimesecs ]; then echo "${FUNCNAME[0]} giving up waiting for listener on port:$port after $totalsleptsecs secs" return 1 @@ -33,7 +32,7 @@ waitforlisteneronTCPport() { if [ -z "$varlistener" ]; then #echo "${FUNCNAME[0]} waiting for $sleepdurationsecs more sec for listener on port:$port ..." 
sleep $sleepdurationsecs - totalsleptsecs=$(($totalsleptsecs+1)) + totalsleptsecs=$(($totalsleptsecs + 1)) else echo "${FUNCNAME[0]} found listener on port:$port in $totalsleptsecs secs" return 0 @@ -65,23 +64,22 @@ checkAgentOnboardingStatus() { successMessage="Loaded data sources" failureMessage="Failed to load data sources into config" fi - while true - do - if [ $totalsleptsecs -gt $waittimesecs ]; then - echo "${FUNCNAME[0]} giving up checking agent onboarding status after $totalsleptsecs secs" - return 1 - fi - - if grep "$successMessage" "${MDSD_LOG}/mdsd.info"; then - echo "Onboarding success" - return 0 - elif grep "$failureMessage" "${MDSD_LOG}/mdsd.err"; then - echo "Onboarding Failure: Reason: Failed to onboard the agent" - echo "Onboarding Failure: Please verify log analytics workspace configuration such as existence of the workspace, workspace key and workspace enabled for public ingestion" - return 1 - fi - sleep $sleepdurationsecs - totalsleptsecs=$(($totalsleptsecs+1)) + while true; do + if [ $totalsleptsecs -gt $waittimesecs ]; then + echo "${FUNCNAME[0]} giving up checking agent onboarding status after $totalsleptsecs secs" + return 1 + fi + + if grep "$successMessage" "${MDSD_LOG}/mdsd.info"; then + echo "Onboarding success" + return 0 + elif grep "$failureMessage" "${MDSD_LOG}/mdsd.err"; then + echo "Onboarding Failure: Reason: Failed to onboard the agent" + echo "Onboarding Failure: Please verify log analytics workspace configuration such as existence of the workspace, workspace key and workspace enabled for public ingestion" + return 1 + fi + sleep $sleepdurationsecs + totalsleptsecs=$(($totalsleptsecs + 1)) done else echo "${FUNCNAME[0]} called with non-numeric arguments<$2>. 
Required arguments <#wait-time-in-seconds>" @@ -90,6 +88,103 @@ checkAgentOnboardingStatus() { fi } +setReplicaSetSpecificConfig() { + echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}" + export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_QUEUE_LIMIT_LENGTH="20" # default + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="20" + export FLUENTD_MDM_FLUSH_THREAD_COUNT="5" # default + case $NUM_OF_FLUENTD_WORKERS in + [5-9]|9[0-9]|100) + export NUM_OF_FLUENTD_WORKERS=5 # Max is 5 core even if the specified limits more than 5 cores + export FLUENTD_POD_INVENTORY_WORKER_ID=4 + export FLUENTD_NODE_INVENTORY_WORKER_ID=3 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + export FLUENTD_FLUSH_INTERVAL="5s" + export FLUENTD_QUEUE_LIMIT_LENGTH="50" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="100" # kube perf is high volume so would need large queue limit to avoid data loss + export MONITORING_MAX_EVENT_RATE="100000" # default MDSD EPS is 20K which is not enough for large scale + export FLUENTD_MDM_FLUSH_THREAD_COUNT="20" # if the pod mdm inventory running on separate worker + ;; + 4) + export NUM_OF_FLUENTD_WORKERS=4 + export FLUENTD_POD_INVENTORY_WORKER_ID=3 + export FLUENTD_NODE_INVENTORY_WORKER_ID=2 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + export FLUENTD_FLUSH_INTERVAL="10s" + export FLUENTD_QUEUE_LIMIT_LENGTH="40" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="80" # kube perf is high volume so would need large queue limit + export MONITORING_MAX_EVENT_RATE="80000" # default MDSD EPS is 20K which is not enough for large scale + ;; + 3) + export NUM_OF_FLUENTD_WORKERS=3 + export FLUENTD_POD_INVENTORY_WORKER_ID=2 + export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 + export 
FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + export FLUENTD_FLUSH_INTERVAL="15s" + export FLUENTD_QUEUE_LIMIT_LENGTH="30" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="60" # kube perf is high volume so would need large queue limit + export MONITORING_MAX_EVENT_RATE="60000" # default MDSD EPS is 20K which is not enough for large scale + ;; + 2) + export NUM_OF_FLUENTD_WORKERS=2 + export FLUENTD_POD_INVENTORY_WORKER_ID=1 + export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_QUEUE_LIMIT_LENGTH="20" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="40" # kube perf is high volume so would need large queue limit + export MONITORING_MAX_EVENT_RATE="40000" # default MDSD EPS is 20K which is not enough for large scale + ;; + + *) + export NUM_OF_FLUENTD_WORKERS=1 + export FLUENTD_POD_INVENTORY_WORKER_ID=0 + export FLUENTD_NODE_INVENTORY_WORKER_ID=0 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_QUEUE_LIMIT_LENGTH="20" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="20" + ;; + esac + echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc + echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_FLUSH_INTERVAL=$FLUENTD_FLUSH_INTERVAL" >>~/.bashrc + echo "export 
FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH=$FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH" >>~/.bashrc + echo "export FLUENTD_QUEUE_LIMIT_LENGTH=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc + echo "export FLUENTD_MDM_FLUSH_THREAD_COUNT=$FLUENTD_MDM_FLUSH_THREAD_COUNT" >>~/.bashrc + + if [ ! -z $MONITORING_MAX_EVENT_RATE ]; then + echo "export MONITORING_MAX_EVENT_RATE=$MONITORING_MAX_EVENT_RATE" >>~/.bashrc + echo "Configured MDSD Max EPS is: ${MONITORING_MAX_EVENT_RATE}" + fi + + source ~/.bashrc + + echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" + echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}" + echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}" + echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" + echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" + echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}" + echo "fluentd kube perf buffer plugin queue length: ${FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH}" + echo "fluentd buffer plugin queue length for all other non kube perf plugin: ${FLUENTD_QUEUE_LIMIT_LENGTH}" + echo "fluentd out mdm flush thread count: ${FLUENTD_MDM_FLUSH_THREAD_COUNT}" +} #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding mkdir -p /var/opt/microsoft/docker-cimprov/state @@ -98,8 +193,8 @@ mkdir -p /var/opt/microsoft/docker-cimprov/state inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s' #Run inotify as a daemon to track changes to the mounted configmap for OSM settings. -if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || - ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then +if [[ ((! 
-e "/etc/config/kube.conf") && ("${CONTAINER_TYPE}" == "PrometheusSidecar")) || + ((-e "/etc/config/kube.conf") && ("${SIDECAR_SCRAPING_ENABLED}" == "false")) ]]; then inotifywait /etc/config/osm-settings --daemon --recursive --outfile "/opt/inotifyoutput-osm.txt" --event create,delete --format '%e : %T' --timefmt '+%s' fi @@ -108,58 +203,58 @@ if [ -z $AKS_RESOURCE_ID ]; then echo "not setting customResourceId" else export customResourceId=$AKS_RESOURCE_ID - echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc + echo "export customResourceId=$AKS_RESOURCE_ID" >>~/.bashrc source ~/.bashrc echo "customResourceId:$customResourceId" export customRegion=$AKS_REGION - echo "export customRegion=$AKS_REGION" >> ~/.bashrc + echo "export customRegion=$AKS_REGION" >>~/.bashrc source ~/.bashrc echo "customRegion:$customRegion" fi #set agent config schema version -if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/schema-version" ]; then +if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/schema-version" ]; then #trim config_schema_version="$(cat /etc/config/settings/schema-version | xargs)" #remove all spaces config_schema_version="${config_schema_version//[[:space:]]/}" #take first 10 characters - config_schema_version="$(echo $config_schema_version| cut -c1-10)" + config_schema_version="$(echo $config_schema_version | cut -c1-10)" export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version - echo "export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version" >> ~/.bashrc + echo "export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version" >>~/.bashrc source ~/.bashrc echo "AZMON_AGENT_CFG_SCHEMA_VERSION:$AZMON_AGENT_CFG_SCHEMA_VERSION" fi #set agent config file version -if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/config-version" ]; then +if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/config-version" ]; then #trim config_file_version="$(cat 
/etc/config/settings/config-version | xargs)" #remove all spaces config_file_version="${config_file_version//[[:space:]]/}" #take first 10 characters - config_file_version="$(echo $config_file_version| cut -c1-10)" + config_file_version="$(echo $config_file_version | cut -c1-10)" export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version - echo "export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version" >> ~/.bashrc + echo "export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version" >>~/.bashrc source ~/.bashrc echo "AZMON_AGENT_CFG_FILE_VERSION:$AZMON_AGENT_CFG_FILE_VERSION" fi #set OSM config schema version -if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || - ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then - if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then +if [[ ((! -e "/etc/config/kube.conf") && ("${CONTAINER_TYPE}" == "PrometheusSidecar")) || + ((-e "/etc/config/kube.conf") && ("${SIDECAR_SCRAPING_ENABLED}" == "false")) ]]; then + if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then #trim osm_config_schema_version="$(cat /etc/config/osm-settings/schema-version | xargs)" #remove all spaces osm_config_schema_version="${osm_config_schema_version//[[:space:]]/}" #take first 10 characters - osm_config_schema_version="$(echo $osm_config_schema_version| cut -c1-10)" + osm_config_schema_version="$(echo $osm_config_schema_version | cut -c1-10)" export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version - echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >> ~/.bashrc + echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >>~/.bashrc source ~/.bashrc echo "AZMON_OSM_CFG_SCHEMA_VERSION:$AZMON_OSM_CFG_SCHEMA_VERSION" fi @@ -201,13 +296,13 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then if [ -z "$host" -o -z "$port" ]; then echo "-e 
error proxy endpoint should be in this format http(s)://: or http(s)://:@:" else - echo "successfully validated provided proxy endpoint is valid and expected format" + echo "successfully validated provided proxy endpoint is valid and expected format" fi - echo $pwd > /opt/microsoft/docker-cimprov/proxy_password + echo $pwd >/opt/microsoft/docker-cimprov/proxy_password export MDSD_PROXY_MODE=application - echo "export MDSD_PROXY_MODE=$MDSD_PROXY_MODE" >> ~/.bashrc + echo "export MDSD_PROXY_MODE=$MDSD_PROXY_MODE" >>~/.bashrc export MDSD_PROXY_ADDRESS=$proto$hostport echo "export MDSD_PROXY_ADDRESS=$MDSD_PROXY_ADDRESS" >> ~/.bashrc if [ ! -z "$user" -a ! -z "$pwd" ]; then @@ -231,8 +326,8 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT fi else - echo "Making curl request to oms endpint with domain: $domain" - curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest + echo "Making curl request to oms endpint with domain: $domain" + curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest fi if [ $? -ne 0 ]; then @@ -245,8 +340,8 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then RET=`curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co --proxy $PROXY_ENDPOINT` fi else - echo "Making curl request to ifconfig.co" - RET=`curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co` + echo "Making curl request to ifconfig.co" + RET=$(curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co) fi if [ $RET -eq 000 ]; then echo "-e error Error resolving host during the onboarding request. 
Check the internet connectivity and/or network policy on the cluster" @@ -261,8 +356,8 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT fi else - echo "ifconfig check succeeded, retrying oms endpoint..." - curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest + echo "ifconfig check succeeded, retrying oms endpoint..." + curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest fi if [ $? -ne 0 ]; then @@ -278,23 +373,22 @@ else echo "LA Onboarding:Workspace Id not mounted, skipping the telemetry check" fi - # Set environment variable for if public cloud by checking the workspace domain. if [ -z $domain ]; then - ClOUD_ENVIRONMENT="unknown" + ClOUD_ENVIRONMENT="unknown" elif [ $domain == "opinsights.azure.com" ]; then - CLOUD_ENVIRONMENT="azurepubliccloud" + CLOUD_ENVIRONMENT="azurepubliccloud" elif [ $domain == "opinsights.azure.cn" ]; then - CLOUD_ENVIRONMENT="azurechinacloud" + CLOUD_ENVIRONMENT="azurechinacloud" elif [ $domain == "opinsights.azure.us" ]; then - CLOUD_ENVIRONMENT="azureusgovernmentcloud" + CLOUD_ENVIRONMENT="azureusgovernmentcloud" elif [ $domain == "opinsights.azure.eaglex.ic.gov" ]; then - CLOUD_ENVIRONMENT="usnat" + CLOUD_ENVIRONMENT="usnat" elif [ $domain == "opinsights.azure.microsoft.scloud" ]; then - CLOUD_ENVIRONMENT="ussec" + CLOUD_ENVIRONMENT="ussec" fi export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT -echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc +echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >>~/.bashrc # Copying over CA certs for airgapped clouds. This is needed for Mariner vs Ubuntu hosts. # We are unable to tell if the host is Mariner or Ubuntu, @@ -302,7 +396,7 @@ echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc # One will have the certs and the other will be empty. 
# These need to be copied to a different location for Mariner vs Ubuntu containers. # OS_ID here is the container distro. -# Adding Mariner now even though the elif will never currently evaluate. +# Adding Mariner now even though the elif will never currently evaluate. if [ $CLOUD_ENVIRONMENT == "usnat" ] || [ $CLOUD_ENVIRONMENT == "ussec" ]; then OS_ID=$(cat /etc/os-release | grep ^ID= | cut -d '=' -f2 | tr -d '"' | tr -d "'") if [ $OS_ID == "mariner" ]; then @@ -322,39 +416,38 @@ fi #consisten naming conventions with the windows export DOMAIN=$domain -echo "export DOMAIN=$DOMAIN" >> ~/.bashrc +echo "export DOMAIN=$DOMAIN" >>~/.bashrc export WSID=$workspaceId -echo "export WSID=$WSID" >> ~/.bashrc +echo "export WSID=$WSID" >>~/.bashrc # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) -if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) +if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) for BACKOFF in {1..4}; do - KEY=$(curl -sS $APPLICATIONINSIGHTS_AUTH_URL ) + KEY=$(curl -sS $APPLICATIONINSIGHTS_AUTH_URL) # there's no easy way to get the HTTP status code from curl, so just check if the result is well formatted if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then break else - sleep $((2**$BACKOFF / 4)) # (exponential backoff) + sleep $((2 ** $BACKOFF / 4)) # (exponential backoff) fi done # validate that the retrieved data is an instrumentation key if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then export APPLICATIONINSIGHTS_AUTH=$(echo $KEY) - echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >> ~/.bashrc + echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >>~/.bashrc echo "Using cloud-specific instrumentation key" else # no ikey can be retrieved. 
Disable telemetry and continue export DISABLE_TELEMETRY=true - echo "export DISABLE_TELEMETRY=true" >> ~/.bashrc + echo "export DISABLE_TELEMETRY=true" >>~/.bashrc echo "Could not get cloud-specific instrumentation key (network error?). Disabling telemetry" fi fi - aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey -echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc +echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >>~/.bashrc source ~/.bashrc @@ -363,7 +456,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.7 tomlparser.rb cat config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source config_env_var fi @@ -399,18 +492,18 @@ fi if [ ! -e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then cat defaultpromenvvariables-sidecar | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source defaultpromenvvariables-sidecar else cat defaultpromenvvariables | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source defaultpromenvvariables fi else cat defaultpromenvvariables-rs | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source defaultpromenvvariables-rs fi @@ -418,7 +511,7 @@ fi #Sourcing environment variable file if it exists. This file has telemetry and whether kubernetes pods are monitored if [ -e "telemetry_prom_config_env_var" ]; then cat telemetry_prom_config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source telemetry_prom_config_env_var fi @@ -431,20 +524,19 @@ if [ ! 
-e "/etc/config/kube.conf" ]; then #Sourcing config environment variable file if it exists if [ -e "side_car_fbit_config_env_var" ]; then cat side_car_fbit_config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source side_car_fbit_config_env_var fi fi fi - #Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting. if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.7 tomlparser-mdm-metrics-config.rb cat config_mdm_metrics_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source config_mdm_metrics_env_var @@ -452,7 +544,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.7 tomlparser-metric-collection-config.rb cat config_metric_collection_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source config_metric_collection_env_var fi @@ -464,15 +556,15 @@ if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "Prometheus if [ -e "integration_osm_config_env_var" ]; then cat integration_osm_config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source integration_osm_config_env_var fi fi -# If the prometheus sidecar isn't doing anything then there's no need to run mdsd and telegraf in it. -if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) && - ( "${TELEMETRY_CUSTOM_PROM_MONITOR_PODS}" == "false" ) && +# If the prometheus sidecar isn't doing anything then there's no need to run mdsd and telegraf in it. 
+if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) && + ( "${TELEMETRY_CUSTOM_PROM_MONITOR_PODS}" == "false" ) && ( "${TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT}" -eq 0 ) ]]; then setGlobalEnvVar MUTE_PROM_SIDECAR true else @@ -498,21 +590,20 @@ fi export CONTAINER_RUNTIME="containerd" export NODE_NAME="" - if [ "$cAdvisorIsSecure" = true ]; then echo "Using port 10250" export IS_SECURE_CADVISOR_PORT=true - echo "export IS_SECURE_CADVISOR_PORT=true" >> ~/.bashrc + echo "export IS_SECURE_CADVISOR_PORT=true" >>~/.bashrc export CADVISOR_METRICS_URL="https://$NODE_IP:10250/metrics" - echo "export CADVISOR_METRICS_URL=https://$NODE_IP:10250/metrics" >> ~/.bashrc + echo "export CADVISOR_METRICS_URL=https://$NODE_IP:10250/metrics" >>~/.bashrc echo "Making curl request to cadvisor endpoint /pods with port 10250 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s -k -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" https://$NODE_IP:10250/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') else echo "Using port 10255" export IS_SECURE_CADVISOR_PORT=false - echo "export IS_SECURE_CADVISOR_PORT=false" >> ~/.bashrc + echo "export IS_SECURE_CADVISOR_PORT=false" >>~/.bashrc export CADVISOR_METRICS_URL="http://$NODE_IP:10255/metrics" - echo "export CADVISOR_METRICS_URL=http://$NODE_IP:10255/metrics" >> ~/.bashrc + echo "export CADVISOR_METRICS_URL=http://$NODE_IP:10255/metrics" >>~/.bashrc echo "Making curl request to cadvisor endpoint with port 10255 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s http://$NODE_IP:10255/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') fi @@ -524,13 +615,13 @@ if [ ! 
-z "$podWithValidContainerId" ]; then containerRuntime=$(echo $containerRuntime | tr "[:upper:]" "[:lower:]") nodeName=$(echo $nodeName | tr "[:upper:]" "[:lower:]") # use default container runtime if obtained runtime value is either empty or null - if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then + if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then echo "using default container runtime as $CONTAINER_RUNTIME since got containeRuntime as empty or null" else export CONTAINER_RUNTIME=$containerRuntime fi - if [ -z "$nodeName" -o "$nodeName" == null ]; then + if [ -z "$nodeName" -o "$nodeName" == null ]; then echo "-e error nodeName in /pods API response is empty" else export NODE_NAME=$nodeName @@ -540,21 +631,21 @@ else fi echo "configured container runtime on kubelet is : "$CONTAINER_RUNTIME -echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >> ~/.bashrc +echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >>~/.bashrc export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="kubelet_runtime_operations_total" -echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >> ~/.bashrc +echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >>~/.bashrc export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="kubelet_runtime_operations_errors_total" -echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >> ~/.bashrc +echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >>~/.bashrc # default to docker metrics export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_docker_operations" export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_docker_operations_errors" if [ "$CONTAINER_RUNTIME" != "docker" ]; then - # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 - export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" - 
export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" + # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 + export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" + export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" fi echo "set caps for ruby process to read container env from proc" @@ -564,7 +655,7 @@ echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="$KUBELET_RUNTIME_OPERATIO source ~/.bashrc -echo $NODE_NAME > /var/opt/microsoft/docker-cimprov/state/containerhostname +echo $NODE_NAME >/var/opt/microsoft/docker-cimprov/state/containerhostname #check if file was written successfully. cat /var/opt/microsoft/docker-cimprov/state/containerhostname @@ -577,16 +668,20 @@ dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' DOCKER_CIMPROV_VERSION=$(dpkg -l | grep docker-cimprov | awk '{print $3}') echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION -echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc +echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >>~/.bashrc +if [ "${CONTROLLER_TYPE}" == "ReplicaSet" ]; then + echo "*** set applicable replicaset config ***" + setReplicaSetSpecificConfig +fi #skip imds lookup since not used either legacy or aad msi auth path export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" -echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc +echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >>~/.bashrc # this used by mdsd to determine cloud specific LA endpoints export OMS_TLD=$domain -echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc +echo "export OMS_TLD=$OMS_TLD" >>~/.bashrc cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source /etc/mdsd.d/envmdsd MDSD_AAD_MSI_AUTH_ARGS="" @@ -650,25 +745,25 @@ if [ "${CONTAINER_TYPE}" 
== "PrometheusSidecar" ]; then echo "not starting mdsd (no metrics to scrape since MUTE_PROM_SIDECAR is true)" fi else - echo "starting mdsd mode in main container..." - # add -T 0xFFFF for full traces - mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>> /dev/null & + echo "starting mdsd in main container..." + # add -T 0xFFFF for full traces + mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>>/dev/null & fi # Set up a cron job for logrotation if [ ! -f /etc/cron.d/ci-agent ]; then - echo "setting up cronjob for ci agent log rotation" - echo "*/5 * * * * root /usr/sbin/logrotate -s /var/lib/logrotate/ci-agent-status /etc/logrotate.d/ci-agent >/dev/null 2>&1" > /etc/cron.d/ci-agent + echo "setting up cronjob for ci agent log rotation" + echo "*/5 * * * * root /usr/sbin/logrotate -s /var/lib/logrotate/ci-agent-status /etc/logrotate.d/ci-agent >/dev/null 2>&1" >/etc/cron.d/ci-agent fi # no dependency on fluentd for prometheus side car container if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then if [ ! -e "/etc/config/kube.conf" ]; then - echo "*** starting fluentd v1 in daemonset" - fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & + echo "*** starting fluentd v1 in daemonset" + fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else - echo "*** starting fluentd v1 in replicaset" - fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & + echo "*** starting fluentd v1 in replicaset" + fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & fi fi @@ -699,13 +794,13 @@ if [ ! 
-e "/etc/config/kube.conf" ]; then fi else if [ -e "/opt/telegraf-test-rs.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test-rs.conf --input-filter file -test - if [ $? -eq 0 ]; then - mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" - echo "Moving test conf file to telegraf replicaset conf since test run succeeded" - fi - echo "****************End Telegraf Run in Test Mode**************************" + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-rs.conf --input-filter file -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + echo "Moving test conf file to telegraf replicaset conf since test run succeeded" + fi + echo "****************End Telegraf Run in Test Mode**************************" fi fi @@ -753,15 +848,15 @@ else fi export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id -echo "export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id" >> ~/.bashrc +echo "export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id" >>~/.bashrc export TELEMETRY_AKS_REGION=$telemetry_aks_region -echo "export TELEMETRY_AKS_REGION=$telemetry_aks_region" >> ~/.bashrc +echo "export TELEMETRY_AKS_REGION=$telemetry_aks_region" >>~/.bashrc export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name -echo "export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name" >> ~/.bashrc +echo "export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name" >>~/.bashrc export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name -echo "export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name" >> ~/.bashrc +echo "export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name" >>~/.bashrc export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type -echo "export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type" >> ~/.bashrc +echo "export 
TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type" >>~/.bashrc #if [ ! -e "/etc/config/kube.conf" ]; then # nodename=$(cat /hostfs/etc/hostname) @@ -773,15 +868,15 @@ echo "replacing nodename in telegraf config" sed -i -e "s/placeholder_hostname/$nodename/g" $telegrafConfFile export HOST_MOUNT_PREFIX=/hostfs -echo "export HOST_MOUNT_PREFIX=/hostfs" >> ~/.bashrc +echo "export HOST_MOUNT_PREFIX=/hostfs" >>~/.bashrc export HOST_PROC=/hostfs/proc -echo "export HOST_PROC=/hostfs/proc" >> ~/.bashrc +echo "export HOST_PROC=/hostfs/proc" >>~/.bashrc export HOST_SYS=/hostfs/sys -echo "export HOST_SYS=/hostfs/sys" >> ~/.bashrc +echo "export HOST_SYS=/hostfs/sys" >>~/.bashrc export HOST_ETC=/hostfs/etc -echo "export HOST_ETC=/hostfs/etc" >> ~/.bashrc +echo "export HOST_ETC=/hostfs/etc" >>~/.bashrc export HOST_VAR=/hostfs/var -echo "export HOST_VAR=/hostfs/var" >> ~/.bashrc +echo "export HOST_VAR=/hostfs/var" >>~/.bashrc if [ ! -e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then @@ -830,9 +925,10 @@ else fi shutdown() { - pkill -f mdsd - } + pkill -f mdsd +} trap "shutdown" SIGTERM -sleep inf & wait +sleep inf & +wait diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 4e021e1b8..d2d7a0c87 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -403,6 +403,8 @@ spec: # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests - name: ISTEST value: "true" + - name: EMIT_CACHE_TELEMETRY + value: "false" #Uncomment below two lines for ACS clusters and set the cluster names manually. 
Also comment out the above two lines for ACS clusters #- name: ACS_RESOURCE_NAME # value: "my_acs_cluster_name" @@ -661,6 +663,13 @@ spec: cpu: 150m memory: 250Mi env: + - name: NUM_OF_FLUENTD_WORKERS + valueFrom: + resourceFieldRef: + containerName: omsagent + resource: limits.cpu + - name: EMIT_CACHE_TELEMETRY + value: "false" # enable only debug or test purpose and disable for prod - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 8925248d7..ffd76bfbd 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -11,6 +11,8 @@ class KubernetesApiClient require_relative "oms_common" require_relative "constants" + require_relative "WatchStream" + require_relative "kubernetes_container_inventory" @@ApiVersion = "v1" @@ApiVersionApps = "v1" @@ -35,8 +37,6 @@ class KubernetesApiClient @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@TokenStr = nil - @@NodeMetrics = Hash.new - @@WinNodeArray = [] @@telemetryTimeTracker = DateTime.now.to_time.to_i @@resourceLimitsTelemetryHash = {} @@ -75,6 +75,39 @@ def getKubeResourceInfo(resource, api_group: nil) return response end + def getKubeResourceInfoV2(resource, api_group: nil) + headers = {} + response = nil + responseCode = nil + @Log.info "Getting Kube resource: #{resource}" + begin + resourceUri = getResourceUri(resource, api_group) + if !resourceUri.nil? 
+ uri = URI.parse(resourceUri) + if !File.exist?(@@CaFile) + raise "#{@@CaFile} doesnt exist" + else + Net::HTTP.start(uri.host, uri.port, :use_ssl => true, :ca_file => @@CaFile, :verify_mode => OpenSSL::SSL::VERIFY_PEER, :open_timeout => 20, :read_timeout => 40) do |http| + kubeApiRequest = Net::HTTP::Get.new(uri.request_uri) + kubeApiRequest["Authorization"] = "Bearer " + getTokenStr + @Log.info "KubernetesAPIClient::getKubeResourceInfoV2 : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" + response = http.request(kubeApiRequest) + responseCode = response.code + @Log.info "KubernetesAPIClient::getKubeResourceInfoV2 : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" + end + end + end + rescue => error + @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") + end + if (!response.nil?) + if (!response.body.nil? && response.body.empty?) + @Log.warn("KubernetesAPIClient::getKubeResourceInfoV2 : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") + end + end + return responseCode, response + end + def getTokenStr return @@TokenStr if !@@TokenStr.nil? begin @@ -88,7 +121,7 @@ def getTokenStr end end - def getClusterRegion(env=ENV) + def getClusterRegion(env = ENV) if env["AKS_REGION"] return env["AKS_REGION"] else @@ -97,7 +130,7 @@ def getClusterRegion(env=ENV) end end - def getResourceUri(resource, api_group, env=ENV) + def getResourceUri(resource, api_group, env = ENV) begin if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"] if api_group.nil? @@ -114,7 +147,7 @@ def getResourceUri(resource, api_group, env=ENV) end end - def getClusterName(env=ENV) + def getClusterName(env = ENV) return @@ClusterName if !@@ClusterName.nil? @@ClusterName = "None" begin @@ -148,7 +181,7 @@ def getClusterName(env=ENV) return @@ClusterName end - def getClusterId(env=ENV) + def getClusterId(env = ENV) return @@ClusterId if !@@ClusterId.nil? 
#By default initialize ClusterId to ClusterName. # In ACS/On-prem, we need to figure out how we can generate ClusterId @@ -292,8 +325,6 @@ def getWindowsNodes resourceUri = getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows") nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body) @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" - # Resetting the windows node cache - @@WinNodeArray.clear if (!nodeInventory.empty?) nodeInventory["items"].each do |item| # check for windows operating system in node metadata @@ -303,11 +334,6 @@ def getWindowsNodes if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) - # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data - # to get images and image tags for containers in windows nodes - if !nodeMetadata.nil? && !nodeMetadata["name"].nil? - @@WinNodeArray.push(nodeMetadata["name"]) - end nodeStatusAddresses = nodeStatus["addresses"] if !nodeStatusAddresses.nil? nodeStatusAddresses.each do |address| @@ -327,7 +353,33 @@ def getWindowsNodes end def getWindowsNodesArray - return @@WinNodeArray + winNodeArray = [] + begin + # get only windows nodes + resourceUri = getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows") + nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body) + @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" + if (!nodeInventory.empty?) + nodeInventory["items"].each do |item| + # check for windows operating system in node metadata + nodeStatus = item["status"] + nodeMetadata = item["metadata"] + if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? 
+ operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data + # to get images and image tags for containers in windows nodes + if !nodeMetadata.nil? && !nodeMetadata["name"].nil? + winNodeArray.push(nodeMetadata["name"]) + end + end + end + end + end + rescue => error + @Log.warn("KubernetesApiClient::getWindowsNodesArray:failed with an error: #{error}") + end + return winNodeArray end def getContainerIDs(namespace) @@ -409,7 +461,7 @@ def getPodUid(podNameSpace, podMetadata) return podUid end - def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, nodeAllocatableRecord, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId @@ -456,19 +508,16 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricCollection = {} metricCollection["CounterName"] = metricNametoReturn metricCollection["Value"] = metricValue - + metricProps["json_Collections"] = [] - metricCollections = [] - metricCollections.push(metricCollection) + metricCollections = [] + metricCollections.push(metricCollection) metricProps["json_Collections"] = metricCollections.to_json - metricItems.push(metricProps) + metricItems.push(metricProps) #No container level limit for the given metric, so default to node level limit else - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) - metricValue = @@NodeMetrics[nodeMetricsHashKey] - #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level 
limits: #{nodeMetricsHashKey}=#{metricValue} ") - + if (metricCategory == "limits" && !nodeAllocatableRecord.nil? && !nodeAllocatableRecord.empty? && nodeAllocatableRecord.has_key?(metricNameToCollect)) + metricValue = nodeAllocatableRecord[metricNameToCollect] metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName @@ -481,10 +530,10 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricCollection["CounterName"] = metricNametoReturn metricCollection["Value"] = metricValue metricProps["json_Collections"] = [] - metricCollections = [] - metricCollections.push(metricCollection) + metricCollections = [] + metricCollections.push(metricCollection) metricProps["json_Collections"] = metricCollections.to_json - metricItems.push(metricProps) + metricItems.push(metricProps) end end end @@ -496,7 +545,7 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle return metricItems end #getContainerResourceRequestAndLimits - def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, nodeAllocatableRecord, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId @@ -541,8 +590,9 @@ def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, else #No container level limit for the given metric, so default to node level limit for non-gpu metrics if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - metricValue = @@NodeMetrics[nodeMetricsHashKey] + if !nodeAllocatableRecord.nil? && !nodeAllocatableRecord.empty? 
&& nodeAllocatableRecord.has_key?(metricNameToCollect) + metricValue = nodeAllocatableRecord[metricNameToCollect] + end end end if (!metricValue.nil?) @@ -615,15 +665,10 @@ def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metri metricCollection["CounterName"] = metricNametoReturn metricCollection["Value"] = metricValue metricCollections = [] - metricCollections.push(metricCollection) - + metricCollections.push(metricCollection) + metricItem["json_Collections"] = [] metricItem["json_Collections"] = metricCollections.to_json - - #push node level metrics to a inmem hash so that we can use it looking up at container level. - #Currently if container level cpu & memory limits are not defined we default to node level limits - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") end rescue => error @Log.warn("parseNodeLimitsFromNodeItem failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") @@ -657,13 +702,6 @@ def parseNodeLimitsAsInsightsMetrics(node, metricCategory, metricNameToCollect, metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect metricItem["Tags"] = metricTags - - #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level. 
- #Currently if container level cpu & memory limits are not defined we default to node level limits - if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") - end end rescue => error @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") @@ -754,6 +792,31 @@ def getMetricNumericValue(metricName, metricVal) return metricValue end # getMetricNumericValue + def getResourcesAndContinuationTokenV2(uri, api_group: nil) + continuationToken = nil + resourceInventory = nil + responseCode = nil + begin + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2 : Getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" + responseCode, resourceInfo = getKubeResourceInfoV2(uri, api_group: api_group) + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2 : Done getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" + if !responseCode.nil? && responseCode == "200" && !resourceInfo.nil? + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2:Start:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" + resourceInventory = Yajl::Parser.parse(StringIO.new(resourceInfo.body)) + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2:End:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" + resourceInfo = nil + end + if (!resourceInventory.nil? && !resourceInventory["metadata"].nil?) 
+ continuationToken = resourceInventory["metadata"]["continue"] + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getResourcesAndContinuationTokenV2:Failed in get resources for #{uri} and continuation token: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + resourceInventory = nil + end + return continuationToken, resourceInventory, responseCode + end #getResourcesAndContinuationTokenV2 + def getResourcesAndContinuationToken(uri, api_group: nil) continuationToken = nil resourceInventory = nil @@ -778,7 +841,7 @@ def getResourcesAndContinuationToken(uri, api_group: nil) return continuationToken, resourceInventory end #getResourcesAndContinuationToken - def getKubeAPIServerUrl(env=ENV) + def getKubeAPIServerUrl(env = ENV) apiServerUrl = nil begin if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"] @@ -818,5 +881,518 @@ def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) end return kubeServiceRecords end + + # Accepts the following options: + # :namespace (string) - the namespace of the entity. + # :name (string) - the name of the entity to watch. + # :label_selector (string) - a selector to restrict the list of returned objects by labels. + # :field_selector (string) - a selector to restrict the list of returned objects by fields. + # :resource_version (string) - shows changes that occur after passed version of a resource. + # :allow_watch_bookmarks (bool) - flag to indicate whether to use bookmark or not. + def watch(resource_name, options = {}) + begin + if !File.exist?(@@CaFile) + raise "#{@@CaFile} doesnt exist" + end + http_options = { + use_ssl: true, + open_timeout: 60, + read_timeout: 240, # https://github.com/kubernetes-client/java/issues/1370 https://github.com/kubernetes-client/java/issues/1578 + ca_file: @@CaFile, + verify_mode: OpenSSL::SSL::VERIFY_PEER, + } + http_headers = { + Authorization: "Bearer " + getTokenStr, + } + ns = "" + if !options[:namespace].to_s.empty? 
+ ns = "namespaces/#{namespace}/" + end + path = "watch/#{ns}#{resource_name}" + path += "/#{options[:name]}" if options[:name] + api_endpoint = "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + "#{path}" + uri = URI.parse(api_endpoint) + params = {} + WATCH_ARGUMENTS.each { |k, v| params[k] = options[v] if options[v] } + uri.query = URI.encode_www_form(params) if params.any? + watcher = WatchStream.new( + uri, + http_options, + http_headers, + @Log + ) + return watcher unless block_given? + begin + watcher.each(&block) + ensure + watcher.finish if watcher + end + rescue => errorStr + @Log.warn "KubernetesApiClient::watch:Failed with an error: #{errorStr}" + end + end + + def getOptimizedItem(resource, resourceItem, isWindowsItem = false) + case resource + when "pods" + return getPodOptimizedItem(resourceItem, isWindowsItem) + when "pods-perf" + return getPodPerfOptimizedItem(resourceItem) + when "nodes" + return getNodeOptimizedItem(resourceItem) + when "services" + return getServiceOptimizedItem(resourceItem) + when "deployments" + return getDeploymentOptimizedItem(resourceItem) + when "horizontalpodautoscalers" + return getHpaOptimizedItem(resourceItem) + else + return resourceItem + end + end + + def getServiceOptimizedItem(resourceItem) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + end + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["selector"] = [] + if !resourceItem["spec"]["selector"].nil? + item["spec"]["selector"] = resourceItem["spec"]["selector"] + end + item["spec"]["clusterIP"] = "" + if !resourceItem["spec"]["clusterIP"].nil? + item["spec"]["clusterIP"] = resourceItem["spec"]["clusterIP"] + end + item["spec"]["type"] = "" + if !resourceItem["spec"]["type"].nil? 
+ item["spec"]["type"] = resourceItem["spec"]["type"] + end + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getServiceOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end + + def isWindowsNodeItem(nodeResourceItem) + isWindowsNodeItem = false + begin + nodeStatus = nodeResourceItem["status"] + if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? + operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + isWindowsNodeItem = true + end + end + rescue => errorStr + $Log.warn "KubernetesApiClient::::isWindowsNodeItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}" + end + return isWindowsNodeItem + end + + def getPodPerfOptimizedItem(resourceItem) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + if !resourceItem["metadata"]["annotations"].nil? + item["metadata"]["annotations"] = {} + item["metadata"]["annotations"]["kubernetes.io/config.hash"] = resourceItem["metadata"]["annotations"]["kubernetes.io/config.hash"] + end + + if !resourceItem["metadata"]["ownerReferences"].nil? && resourceItem["metadata"]["ownerReferences"].length > 0 + item["metadata"]["ownerReferences"] = [] + ownerReference = {} + ownerReference["name"] = resourceItem["metadata"]["ownerReferences"][0]["name"] + ownerReference["kind"] = resourceItem["metadata"]["ownerReferences"][0]["kind"] + item["metadata"]["ownerReferences"].push(ownerReference) + end + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + item["metadata"]["uid"] = resourceItem["metadata"]["uid"] + end + + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["containers"] = [] + if !resourceItem["spec"]["containers"].nil? 
+ resourceItem["spec"]["containers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + item["spec"]["containers"].push(currentContainer) + end + end + item["spec"]["initContainers"] = [] + if !resourceItem["spec"]["initContainers"].nil? + resourceItem["spec"]["initContainers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + item["spec"]["initContainers"].push(currentContainer) + end + end + item["spec"]["nodeName"] = "" + if !resourceItem["spec"]["nodeName"].nil? + item["spec"]["nodeName"] = resourceItem["spec"]["nodeName"] + end + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getPodPerfOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end + + def getPodOptimizedItem(resourceItem, isWindowsPodItem) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + if !resourceItem["metadata"]["annotations"].nil? + item["metadata"]["annotations"] = {} + item["metadata"]["annotations"]["kubernetes.io/config.hash"] = resourceItem["metadata"]["annotations"]["kubernetes.io/config.hash"] + end + if !resourceItem["metadata"]["labels"].nil? + item["metadata"]["labels"] = resourceItem["metadata"]["labels"] + end + if !resourceItem["metadata"]["ownerReferences"].nil? 
&& resourceItem["metadata"]["ownerReferences"].length > 0 + item["metadata"]["ownerReferences"] = [] + ownerReference = {} + ownerReference["name"] = resourceItem["metadata"]["ownerReferences"][0]["name"] + ownerReference["kind"] = resourceItem["metadata"]["ownerReferences"][0]["kind"] + item["metadata"]["ownerReferences"].push(ownerReference) + end + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + item["metadata"]["uid"] = resourceItem["metadata"]["uid"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + if !resourceItem["metadata"]["deletionTimestamp"].nil? + item["metadata"]["deletionTimestamp"] = resourceItem["metadata"]["deletionTimestamp"] + end + end + + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["containers"] = [] + item["spec"]["initContainers"] = [] + isDisableClusterCollectEnvVar = false + clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] + if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? && clusterCollectEnvironmentVar.casecmp("false") == 0 + isDisableClusterCollectEnvVar = true + end + + # container spec required only for windows container inventory records + if isWindowsPodItem + if !resourceItem["spec"]["containers"].nil? 
+ resourceItem["spec"]["containers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + # fields required for windows containers records + if isWindowsPodItem + currentContainer["image"] = container["image"] + currentContainer["ports"] = container["ports"] + currentContainer["command"] = container["command"] + currentContainer["env"] = "" + if !isDisableClusterCollectEnvVar + currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + end + end + item["spec"]["containers"].push(currentContainer) + end + end + if !resourceItem["spec"]["initContainers"].nil? + resourceItem["spec"]["initContainers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + # fields required for windows containers records + if isWindowsPodItem + currentContainer["image"] = container["image"] + currentContainer["ports"] = container["ports"] + currentContainer["command"] = container["command"] + currentContainer["env"] = "" + if !isDisableClusterCollectEnvVar + currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + end + end + item["spec"]["initContainers"].push(currentContainer) + end + end + end + + item["spec"]["nodeName"] = "" + if !resourceItem["spec"]["nodeName"].nil? + item["spec"]["nodeName"] = resourceItem["spec"]["nodeName"] + end + end + item["status"] = {} + + if !resourceItem["status"].nil? + if !resourceItem["status"]["startTime"].nil? + item["status"]["startTime"] = resourceItem["status"]["startTime"] + end + if !resourceItem["status"]["reason"].nil? + item["status"]["reason"] = resourceItem["status"]["reason"] + end + if !resourceItem["status"]["podIP"].nil? 
+ item["status"]["podIP"] = resourceItem["status"]["podIP"] + end + if !resourceItem["status"]["phase"].nil? + item["status"]["phase"] = resourceItem["status"]["phase"] + end + if !resourceItem["status"]["conditions"].nil? + item["status"]["conditions"] = [] + resourceItem["status"]["conditions"].each do |condition| + currentCondition = {} + currentCondition["type"] = condition["type"] + currentCondition["status"] = condition["status"] + item["status"]["conditions"].push(currentCondition) + end + end + item["status"]["initContainerStatuses"] = [] + if !resourceItem["status"]["initContainerStatuses"].nil? + resourceItem["status"]["initContainerStatuses"].each do |containerStatus| + currentContainerStatus = {} + currentContainerStatus["containerID"] = containerStatus["containerID"] + currentContainerStatus["name"] = containerStatus["name"] + currentContainerStatus["restartCount"] = containerStatus["restartCount"] + currentContainerStatus["state"] = containerStatus["state"] + currentContainerStatus["lastState"] = containerStatus["lastState"] + if isWindowsPodItem + currentContainerStatus["imageID"] = containerStatus["imageID"] + end + item["status"]["initContainerStatuses"].push(currentContainerStatus) + end + end + item["status"]["containerStatuses"] = [] + if !resourceItem["status"]["containerStatuses"].nil? 
+ resourceItem["status"]["containerStatuses"].each do |containerStatus| + currentContainerStatus = {} + currentContainerStatus["containerID"] = containerStatus["containerID"] + currentContainerStatus["name"] = containerStatus["name"] + currentContainerStatus["restartCount"] = containerStatus["restartCount"] + currentContainerStatus["state"] = containerStatus["state"] + currentContainerStatus["lastState"] = containerStatus["lastState"] + if isWindowsPodItem + currentContainerStatus["imageID"] = containerStatus["imageID"] + end + item["status"]["containerStatuses"].push(currentContainerStatus) + end + end + # this metadata used to identify the pod scheduled onto windows node + # so that pod inventory can make decision to extract containerinventory records or not + if isWindowsPodItem + item["isWindows"] = "true" + end + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getPodOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end + + def getNodeAllocatableValues(nodeResourceItem) + nodeAllocatable = {} + begin + if !nodeResourceItem["status"].nil? && + !nodeResourceItem["status"]["allocatable"].nil? && + !nodeResourceItem["status"]["allocatable"].empty? + nodeAllocatable["cpu"] = nodeResourceItem["status"]["allocatable"]["cpu"] + nodeAllocatable["memory"] = nodeResourceItem["status"]["allocatable"]["memory"] + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getNodeAllocatableValues:Failed with an error : #{errorStr}" + end + return nodeAllocatable + end + + def getNodeOptimizedItem(resourceItem) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + if !resourceItem["metadata"]["labels"].nil? + item["metadata"]["labels"] = resourceItem["metadata"]["labels"] + end + end + item["spec"] = {} + if !resourceItem["spec"].nil? 
+ if !resourceItem["spec"]["providerID"].nil? && !resourceItem["spec"]["providerID"].empty? + provider = resourceItem["spec"]["providerID"].split(":")[0] + if !provider.nil? && !provider.empty? + item["spec"]["providerID"] = provider + end + end + end + item["status"] = {} + if !resourceItem["status"].nil? + item["status"]["conditions"] = [] + if !resourceItem["status"]["conditions"].nil? + resourceItem["status"]["conditions"].each do |condition| + currentCondition = {} + currentCondition["type"] = condition["type"] + currentCondition["status"] = condition["status"] + currentCondition["lastTransitionTime"] = condition["lastTransitionTime"] + item["status"]["conditions"].push(currentCondition) + end + end + + nodeInfo = {} + if !resourceItem["status"]["nodeInfo"].nil? && !resourceItem["status"]["nodeInfo"].empty? + nodeInfo["kubeletVersion"] = resourceItem["status"]["nodeInfo"]["kubeletVersion"] + nodeInfo["kubeProxyVersion"] = resourceItem["status"]["nodeInfo"]["kubeProxyVersion"] + nodeInfo["osImage"] = resourceItem["status"]["nodeInfo"]["osImage"] + nodeInfo["containerRuntimeVersion"] = resourceItem["status"]["nodeInfo"]["containerRuntimeVersion"] + nodeInfo["operatingSystem"] = resourceItem["status"]["nodeInfo"]["operatingSystem"] + nodeInfo["kernelVersion"] = resourceItem["status"]["nodeInfo"]["kernelVersion"] + end + item["status"]["nodeInfo"] = nodeInfo + + nodeAllocatable = {} + if !resourceItem["status"]["allocatable"].nil? && !resourceItem["status"]["allocatable"].empty? + nodeAllocatable["cpu"] = resourceItem["status"]["allocatable"]["cpu"] + nodeAllocatable["memory"] = resourceItem["status"]["allocatable"]["memory"] + if !resourceItem["status"]["allocatable"]["nvidia.com/gpu"].nil? + nodeAllocatable["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"] + end + if !resourceItem["status"]["allocatable"]["amd.com/gpu"].nil? 
+ nodeAllocatable["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"] + end + end + item["status"]["allocatable"] = nodeAllocatable + + nodeCapacity = {} + if !resourceItem["status"]["capacity"].nil? && !resourceItem["status"]["capacity"].empty? + nodeCapacity["cpu"] = resourceItem["status"]["capacity"]["cpu"] + nodeCapacity["memory"] = resourceItem["status"]["capacity"]["memory"] + if !resourceItem["status"]["capacity"]["nvidia.com/gpu"].nil? + nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["capacity"]["nvidia.com/gpu"] + end + if !resourceItem["status"]["capacity"]["amd.com/gpu"].nil? + nodeCapacity["amd.com/gpu"] = resourceItem["status"]["capacity"]["amd.com/gpu"] + end + end + item["status"]["capacity"] = nodeCapacity + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getNodeOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end + + def getDeploymentOptimizedItem(resourceItem) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + end + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["strategy"] = {} + if !resourceItem["spec"]["strategy"].nil? && !resourceItem["spec"]["strategy"].empty? && !resourceItem["spec"]["strategy"]["type"].nil? + item["spec"]["strategy"]["type"] = resourceItem["spec"]["strategy"]["type"] + end + if !resourceItem["spec"]["replicas"].nil? + item["spec"]["replicas"] = resourceItem["spec"]["replicas"] + end + end + item["status"] = {} + if !resourceItem["status"].nil? + if !resourceItem["status"]["readyReplicas"].nil? + item["status"]["readyReplicas"] = resourceItem["status"]["readyReplicas"] + end + if !resourceItem["status"]["updatedReplicas"].nil? + item["status"]["updatedReplicas"] = resourceItem["status"]["updatedReplicas"] + end + if !resourceItem["status"]["availableReplicas"].nil? 
+ item["status"]["availableReplicas"] = resourceItem["status"]["availableReplicas"] + end + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getDeploymentOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end + + def getHpaOptimizedItem(resourceItem) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + end + item["spec"] = {} + if !resourceItem["spec"].nil? + if !resourceItem["spec"]["minReplicas"].nil? + item["spec"]["minReplicas"] = resourceItem["spec"]["minReplicas"] + end + if !resourceItem["spec"]["maxReplicas"].nil? + item["spec"]["maxReplicas"] = resourceItem["spec"]["maxReplicas"] + end + item["spec"]["scaleTargetRef"] = {} + if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["kind"].nil? + item["spec"]["scaleTargetRef"]["kind"] = resourceItem["spec"]["scaleTargetRef"]["kind"] + end + if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["name"].nil? + item["spec"]["scaleTargetRef"]["name"] = resourceItem["spec"]["scaleTargetRef"]["name"] + end + end + item["status"] = {} + if !resourceItem["status"].nil? + if !resourceItem["status"]["currentReplicas"].nil? + item["status"]["currentReplicas"] = resourceItem["status"]["currentReplicas"] + end + if !resourceItem["status"]["desiredReplicas"].nil? + item["status"]["desiredReplicas"] = resourceItem["status"]["desiredReplicas"] + end + if !resourceItem["status"]["lastScaleTime"].nil? 
+ item["status"]["lastScaleTime"] = resourceItem["status"]["lastScaleTime"] + end + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getHpaOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end + + def getPodReadyCondition(podStatusConditions) + podReadyCondition = false + begin + if !podStatusConditions.nil? && !podStatusConditions.empty? + podStatusConditions.each do |condition| + if condition["type"] == "Ready" + if condition["status"].downcase == "true" + podReadyCondition = true + end + break #Exit the for loop since we found the ready condition + end + end + end + rescue => err + @Log.warn "in_kube_podinventory::getPodReadyCondition failed with an error: #{err}" + end + return podReadyCondition + end + + def isEmitCacheTelemetry + isEmitCacheTelemtryEnabled = false + if !ENV["EMIT_CACHE_TELEMETRY"].nil? && !ENV["EMIT_CACHE_TELEMETRY"].empty? && ENV["EMIT_CACHE_TELEMETRY"].downcase == "true" + isEmitCacheTelemtryEnabled = true + end + return isEmitCacheTelemtryEnabled + end end end diff --git a/source/plugins/ruby/WatchStream.rb b/source/plugins/ruby/WatchStream.rb new file mode 100644 index 000000000..6cc850450 --- /dev/null +++ b/source/plugins/ruby/WatchStream.rb @@ -0,0 +1,70 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require "net/http" +require "net/https" +require "yajl/json_gem" +require "logger" +require "time" + +WATCH_ARGUMENTS = { + "labelSelector" => :label_selector, + "fieldSelector" => :field_selector, + "resourceVersion" => :resource_version, + "allowWatchBookmarks" => :allow_watch_bookmarks, + "timeoutSeconds" => :timeout_seconds, +}.freeze + +# HTTP Stream used to watch changes on entities +class WatchStream + def initialize(uri, http_options, http_headers, logger) + @uri = uri + @http_client = nil + @http_options = http_options + @http_headers = http_headers + @logger = logger + @path = "" + @logger.info "WatchStream::initialize @ #{Time.now.utc.iso8601}" + end + + def each + @finished = false + 
buffer = +"" + @logger.info "WatchStream::each:Opening TCP session @ #{Time.now.utc.iso8601}" + @http_client = Net::HTTP.start(@uri.host, @uri.port, @http_options) + if @http_client.nil? + raise "WatchStream::each:Failed to create HTTPClient object @ #{Time.now.utc.iso8601}" + end + @path = @uri.path + if @path.nil? || @path.empty? + raise "WatchStream::each:URI path should not be empty or nil @ #{Time.now.utc.iso8601}" + end + if !@uri.query.nil? && !@uri.query.empty? + @path += "?" + @uri.query + end + @logger.info "WatchStream::each:Making GET API call for Watch with path: #{@path} @ #{Time.now.utc.iso8601}" + @http_client.request_get(@path, @http_headers) do |response| + if !response.nil? && response.code.to_i > 300 + raise "WatchStream::each:Watch connection of the path: #{@path} failed with an http status code: #{response.code} @ #{Time.now.utc.iso8601}" + end + response.read_body do |chunk| + buffer << chunk + while (line = buffer.slice!(/.+\n/)) + yield(Yajl::Parser.parse(StringIO.new(line.chomp))) + end + end + end + rescue => e + raise e + end + + def finish + begin + @finished = true + @logger.info "WatchStream::finish:Closing HTTP session of the path:#{@path} @ #{Time.now.utc.iso8601}" + @http_client.finish if !@http_client.nil? && @http_client.started? 
+ rescue => error + @logger.warn "WatchStream::finish:Closing of HTTP session of the path: #{@path} failed with an error: #{error} @ #{Time.now.utc.iso8601}" + end + end +end diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 542f342a6..5f57b465a 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -136,6 +136,12 @@ class Constants #To evaluate switching to Windows AMA 64KB impacts any existing customers MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY = 65536 + # FileName for MDM POD Inventory state + MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json" + # FileName for NodeAllocatable Records state + NODE_ALLOCATABLE_RECORDS_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/NodeAllocatableRecords.json" + # Emit Stream size for Pod MDM metric + POD_MDM_EMIT_STREAM_BATCH_SIZE = 5000 # each record is 200 bytes, 5k records ~2MB # only used in windows in AAD MSI auth mode IMDS_TOKEN_PATH_FOR_WINDOWS = "c:/etc/imds-access-token/token" end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 5a52a089b..a3cbb5a85 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -7,11 +7,12 @@ module Fluent::Plugin class Kube_nodeInventory_Input < Input Fluent::Plugin.register_input("kube_nodes", self) - def initialize(kubernetesApiClient = nil, + def initialize(is_unit_test_mode = nil, kubernetesApiClient = nil, applicationInsightsUtility = nil, extensionUtils = nil, env = nil, - telemetry_flush_interval = nil) + telemetry_flush_interval = nil, + node_items_test_cache = nil) super() require "yaml" @@ -30,6 +31,8 @@ def initialize(kubernetesApiClient = nil, @extensionUtils = extensionUtils == nil ? ExtensionUtils : extensionUtils @env = env == nil ? ENV : env @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = telemetry_flush_interval == nil ? 
Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES : telemetry_flush_interval + @is_unit_test_mode = is_unit_test_mode == nil ? false : true + @node_items_test_cache = node_items_test_cache # these defines were previously at class scope Moving them into the constructor so that they can be set by unit tests @@configMapMountPath = "/etc/config/settings/log-data-collection-settings" @@ -63,6 +66,9 @@ def initialize(kubernetesApiClient = nil, require_relative "constants" @NodeCache = NodeStatsCache.new() + @watchNodesThread = nil + @nodeItemsCache = {} + @nodeItemsCacheSizeKB = 0 end config_param :run_interval, :time, :default => 60 @@ -96,6 +102,8 @@ def start @finished = false @condition = ConditionVariable.new @mutex = Mutex.new + @nodeCacheMutex = Mutex.new + @watchNodesThread = Thread.new(&method(:watch_nodes)) @thread = Thread.new(&method(:run_periodic)) @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i @@ -109,6 +117,7 @@ def shutdown @condition.signal } @thread.join + @watchNodesThread.join super # This super must be at the end of shutdown method end end @@ -147,43 +156,30 @@ def enumerate # Initializing continuation token to nil continuationToken = nil - $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - # KubernetesApiClient.getNodesResourceUri is a pure function, so call it from the actual module instead of from the mock - resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") - continuationToken, nodeInventory = @kubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory = {} + @nodeItemsCacheSizeKB = 0 + nodeCount = 0 + nodeInventory["items"] = getNodeItemsFromCache() nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - 
nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - nodeCount += nodeInventory["items"].length - $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeCount = nodeInventory["items"].length + $log.info("in_kube_nodes::enumerate : number of node items :#{nodeCount} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i - continuationToken, nodeInventory = @kubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") - nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i - @nodesAPIE2ELatencyMs = @nodesAPIE2ELatencyMs + (nodesAPIChunkEndTime - nodesAPIChunkStartTime) - if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
- nodeCount += nodeInventory["items"].length - $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(nodeInventory, batchTime) - else - $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" - end - end - @nodeInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - nodeInventoryStartTime) timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) @applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) - @applicationInsightsUtility.sendMetricTelemetry("NodeCount", nodeCount, {}) + telemetryProperties = {} + if KubernetesApiClient.isEmitCacheTelemetry() + telemetryProperties["NODE_ITEMS_CACHE_SIZE_KB"] = @nodeItemsCacheSizeKB + end + ApplicationInsightsUtility.sendMetricTelemetry("NodeCount", nodeCount, telemetryProperties) @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end # Setting this to nil so that we dont hold memory until GC kicks in @@ -205,10 +201,19 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) insightsMetricsEventStream = Fluent::MultiEventStream.new kubePerfEventStream = Fluent::MultiEventStream.new @@istestvar = @env["ISTEST"] + nodeAllocatableRecords = {} #get node inventory nodeInventory["items"].each do |item| # node inventory nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) + # node allocatble records for the kube perf plugin + nodeName = item["metadata"]["name"] + if !nodeName.nil? && !nodeName.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? 
+ nodeAllocatableRecords[nodeName] = nodeAllocatable + end + end eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") @@ -428,6 +433,17 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end end + if !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? + nodeAllocatableRecordsJson = nodeAllocatableRecords.to_json + if !nodeAllocatableRecordsJson.empty? + @log.info "Writing node allocatable records to state file with size(bytes): #{nodeAllocatableRecordsJson.length}" + @log.info "in_kube_nodes::parse_and_emit_records:Start:writeNodeAllocatableRecords @ #{Time.now.utc.iso8601}" + writeNodeAllocatableRecords(nodeAllocatableRecordsJson) + @log.info "in_kube_nodes::parse_and_emit_records:End:writeNodeAllocatableRecords @ #{Time.now.utc.iso8601}" + end + nodeAllocatableRecordsJson = nil + nodeAllocatableRecords = nil + end rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -577,6 +593,211 @@ def getNodeTelemetryProps(item) end return properties end + + def watch_nodes + if !@is_unit_test_mode + $log.info("in_kube_nodes::watch_nodes:Start @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + loop do + begin + if nodesResourceVersion.nil? 
+ # clear cache before filling the cache with list + @nodeCacheMutex.synchronize { + @nodeItemsCache.clear() + } + continuationToken = nil + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") + $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? + @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + while (!continuationToken.nil? && !continuationToken.empty?) 
+ continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}") + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # break, if any of the pagination call failed so that full cache can be rebuild with LIST again + break + else + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? + @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + end + end + end + end + if nodesResourceVersion.nil? || nodesResourceVersion.empty? 
|| nodesResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? 
+ @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + @nodeCacheMutex.synchronize { + @nodeItemsCache.delete(key) + } + end + end + when "ERROR" + nodesResourceVersion = nil + $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + nodesResourceVersion = nil + $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + break + end + end + end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher + end + end + rescue => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + end + end + $log.info("in_kube_nodes::watch_nodes:End @ #{Time.now.utc.iso8601}") + end + end + + def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) + maxRetryCount = 5 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 + begin + f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "w") + if !f.nil? 
+ isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + f.write(nodeAllocatbleRecordsJson) + f.flush + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + $log.info "in_kube_nodes::writeNodeAllocatableRecords:Successfull and with time taken(ms): #{timetakenMs}" + else + raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to open file for write" + end + rescue => err + if retryAttemptCount < maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? + retryAttemptCount = retryAttemptCount + 1 + sleep (initialRetryDelaySecs * retryAttemptCount) + retry + end + $log.warn "in_kube_nodes::writeNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) + ensure + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? + end + end + + def getNodeItemsFromCache() + nodeItems = {} + if @is_unit_test_mode + nodeItems = @node_items_test_cache + else + @nodeCacheMutex.synchronize { + nodeItems = @nodeItemsCache.values.clone + if KubernetesApiClient.isEmitCacheTelemetry() + @nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024 + end + } + end + return nodeItems + end end # Kube_Node_Input class NodeStatsCache diff --git a/source/plugins/ruby/in_kube_nodes_test.rb b/source/plugins/ruby/in_kube_nodes_test.rb index 8f4984c6c..7d55ea32d 100644 --- a/source/plugins/ruby/in_kube_nodes_test.rb +++ b/source/plugins/ruby/in_kube_nodes_test.rb @@ -1,10 +1,10 @@ -require 'minitest/autorun' +require "minitest/autorun" -require 'fluent/test' -require 'fluent/test/driver/input' -require 'fluent/test/helpers' +require "fluent/test" +require "fluent/test/driver/input" +require "fluent/test/helpers" -require_relative 'in_kube_nodes.rb' +require_relative "in_kube_nodes.rb" class InKubeNodesTests < Minitest::Test 
include Fluent::Test::Helpers @@ -13,20 +13,22 @@ def setup Fluent::Test.setup end - def create_driver(conf = {}, kubernetesApiClient=nil, applicationInsightsUtility=nil, extensionUtils=nil, env=nil, telemetry_flush_interval=nil) - Fluent::Test::Driver::Input.new(Fluent::Plugin::Kube_nodeInventory_Input.new(kubernetesApiClient=kubernetesApiClient, - applicationInsightsUtility=applicationInsightsUtility, - extensionUtils=extensionUtils, - env=env)).configure(conf) + def create_driver(conf = {}, is_unit_test_mode = true, kubernetesApiClient = nil, applicationInsightsUtility = nil, extensionUtils = nil, env = nil, telemetry_flush_interval = nil, node_items_test_cache) + Fluent::Test::Driver::Input.new(Fluent::Plugin::Kube_nodeInventory_Input.new(is_unit_test_mode, kubernetesApiClient = kubernetesApiClient, + applicationInsightsUtility = applicationInsightsUtility, + extensionUtils = extensionUtils, + env = env, + telemetry_flush_interval, + node_items_test_cache)).configure(conf) end # Collection time of scraped data will always be different. Overwrite it in any records returned by in_kube_nodes.rb def overwrite_collection_time(data) if data.key?("CollectionTime") - data["CollectionTime"] = "~CollectionTime~" + data["CollectionTime"] = "~CollectionTime~" end if data.key?("Timestamp") - data["Timestamp"] = "~Timestamp~" + data["Timestamp"] = "~Timestamp~" end return data end @@ -45,41 +47,46 @@ def test_basic_single_node # isAADMSIAuthMode() is called multiple times and we don't really care how many times it is called.
This is the same as mocking # but it doesn't track how many times isAADMSIAuthMode is called def extensionUtils.isAADMSIAuthMode - false + false end nodes_api_response = eval(File.open("test/unit-tests/canned-api-responses/kube-nodes.txt").read) - kubeApiClient.expect(:getResourcesAndContinuationToken, [nil, nodes_api_response], ["nodes?limit=200"]) + node_items_test_cache = nodes_api_response["items"] + kubeApiClient.expect(:getClusterName, "/cluster-name") kubeApiClient.expect(:getClusterId, "/cluster-id") + def appInsightsUtil.sendExceptionTelemetry(exception) + if exception.to_s != "undefined method `[]' for nil:NilClass" + raise "an unexpected exception has occured" + end + end config = "run_interval 999999999" # only run once - d = create_driver(config, kubernetesApiClient=kubeApiClient, applicationInsightsUtility=appInsightsUtil, extensionUtils=extensionUtils, env=env) + d = create_driver(config, true, kubernetesApiClient = kubeApiClient, applicationInsightsUtility = appInsightsUtil, extensionUtils = extensionUtils, env = env, node_items_test_cache) d.instance.start d.instance.enumerate d.run(timeout: 99999) # Input plugins decide when to run, so we have to give it enough time to run - - expected_responses = { ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", 
"kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"aks-nodepool1-24816391-vmss000000", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"})] => true, - ["mdm.kubenodeinventory", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"aks-nodepool1-24816391-vmss000000", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"})] => true, - 
["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"})] => true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1900000000.0}]"})] => true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":4787511296.0}]"})] => true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000000.0}]"})] => true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":7291510784.0}]"})] => true} + expected_responses = { ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", overwrite_collection_time({ "CollectionTime" => 
"2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "aks-nodepool1-24816391-vmss000000", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" })] => true, + ["mdm.kubenodeinventory", overwrite_collection_time({ "CollectionTime" => "2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", 
"kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "aks-nodepool1-24816391-vmss000000", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" })] => true, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", overwrite_collection_time({ "CollectionTime" => "2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" })] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1900000000.0}]" })] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":4787511296.0}]" })] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => 
"aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000000.0}]" })] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":7291510784.0}]" })] => true } d.events.each do |tag, time, record| - cleaned_record = overwrite_collection_time record - if expected_responses.key?([tag, cleaned_record]) - expected_responses[[tag, cleaned_record]] = true - else - assert(false, "got unexpected record") - end + cleaned_record = overwrite_collection_time record + if expected_responses.key?([tag, cleaned_record]) + expected_responses[[tag, cleaned_record]] = true + else + assert(false, "got unexpected record: #{cleaned_record}") + end end expected_responses.each do |key, val| - assert(val, "expected record not emitted: #{key}") + assert(val, "expected record not emitted: #{key}") end # make sure all mocked methods were called the expected number of times @@ -104,7 +111,7 @@ def test_malformed_node_spec # isAADMSIAuthMode() is called multiple times and we don't really care how many time it is called. This is the same as mocking # but it doesn't track how many times isAADMSIAuthMode is called def extensionUtils.isAADMSIAuthMode - false + false end # Set up the KubernetesApiClient Mock. Note: most of the functions in KubernetesApiClient are pure (access no @@ -112,16 +119,17 @@ def extensionUtils.isAADMSIAuthMode # more brittle). Instead, in_kube_nodes bypasses the mock and directly calls these functions in KubernetesApiClient. 
# Ideally the pure functions in KubernetesApiClient would be refactored into their own file to reduce confusion. nodes_api_response = eval(File.open("test/unit-tests/canned-api-responses/kube-nodes-malformed.txt").read) - kubeApiClient.expect(:getResourcesAndContinuationToken, [nil, nodes_api_response], ["nodes?limit=200"]) + node_items_test_cache = nodes_api_response["items"] + kubeApiClient.expect(:getClusterName, "/cluster-name") kubeApiClient.expect(:getClusterName, "/cluster-name") kubeApiClient.expect(:getClusterId, "/cluster-id") kubeApiClient.expect(:getClusterId, "/cluster-id") def appInsightsUtil.sendExceptionTelemetry(exception) - if exception.to_s != "undefined method `[]' for nil:NilClass" - raise "an unexpected exception has occured" - end + if exception.to_s != "undefined method `[]' for nil:NilClass" + raise "an unexpected exception has occured" + end end # This test doesn't care if metric telemetry is sent properly. Looking for an unnecessary value would make it needlessly rigid @@ -130,38 +138,38 @@ def appInsightsUtil.sendMetricTelemetry(a, b, c) config = "run_interval 999999999" # only run once - d = create_driver(config, kubernetesApiClient=kubeApiClient, applicationInsightsUtility=appInsightsUtil, extensionUtils=extensionUtils, env=env, telemetry_flush_interval=0) + d = create_driver(config, true, kubernetesApiClient = kubeApiClient, applicationInsightsUtility = appInsightsUtil, extensionUtils = extensionUtils, env = env, telemetry_flush_interval = 0, node_items_test_cache) d.instance.start d.instance.enumerate d.run(timeout: 99999) #TODO: is this necessary? 
expected_responses = { - ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"correct-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - ["mdm.kubenodeinventory", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", 
"kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"correct-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1000000.0}]"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":444.0}]"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000.0}]"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":555.0}]"}] 
=> false, - - # these records are for the malformed node (it doesn't have limits or requests set so there are no PERF records) - ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"malformed-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - ["mdm.kubenodeinventory", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", 
"kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"malformed-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"}] => false + ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "correct-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => 
"Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["mdm.kubenodeinventory", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "correct-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { 
"Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1000000.0}]" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":444.0}]" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000.0}]" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":555.0}]" }] => false, + + # these records are for the malformed node (it doesn't have limits or requests set so there are no PERF records) + ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => 
"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "malformed-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["mdm.kubenodeinventory", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "malformed-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => 
"v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" }] => false, } d.events.each do |tag, time, record| - cleaned_record = overwrite_collection_time record - if expected_responses.key?([tag, cleaned_record]) - expected_responses[[tag, cleaned_record]] = true - end - # don't do anything if an unexpected record was emitted. Since the node spec is malformed, there will be some partial data. - # we care more that the non-malformed data is still emitted + cleaned_record = overwrite_collection_time record + if expected_responses.key?([tag, cleaned_record]) + expected_responses[[tag, cleaned_record]] = true + end + # don't do anything if an unexpected record was emitted. Since the node spec is malformed, there will be some partial data. + # we care more that the non-malformed data is still emitted end expected_responses.each do |key, val| - assert(val, "expected record not emitted: #{key}") + assert(val, "expected record not emitted: #{key}") end kubeApiClient.verify diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb new file mode 100644 index 000000000..ad8fdbf21 --- /dev/null +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -0,0 +1,433 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require "fluent/plugin/input" + +module Fluent::Plugin + class Kube_PerfInventory_Input < Input + Fluent::Plugin.register_input("kube_perfinventory", self) + + def initialize + super + require "yaml" + require "yajl/json_gem" + require "yajl" + require "set" + require "time" + require "net/http" + + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + require_relative "constants" + require_relative 
"extension_utils" + + # refer tomlparser-agent-config for updating defaults + # this configurable via configmap + @PODS_CHUNK_SIZE = 0 + @PODS_EMIT_STREAM_BATCH_SIZE = 0 + + @watchPodsThread = nil + @podItemsCache = {} + + @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => "oneagent.containerInsights.LINUX_PERF_BLOB" + + def configure(conf) + super + end + + def start + if @run_interval + super + if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? && ENV["PODS_CHUNK_SIZE"].to_i > 0 + @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_perfinventory::start: setting to default value since got PODS_CHUNK_SIZE nil or empty") + @PODS_CHUNK_SIZE = 1000 + end + $log.info("in_kube_perfinventory::start: PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}") + + if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? 
&& ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i > 0 + @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_perfinventory::start: setting to default value since got PODS_EMIT_STREAM_BATCH_SIZE nil or empty") + @PODS_EMIT_STREAM_BATCH_SIZE = 200 + end + $log.info("in_kube_perfinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") + + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @podCacheMutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @watchPodsThread = Thread.new(&method(:watch_pods)) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + @watchPodsThread.join + super # This super must be at the end of shutdown method + end + end + + def enumerate(podList = nil) + begin + podInventory = podList + @podCount = 0 + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kube_perfinventory::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @insightsMetricsTag.nil? 
|| !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + $log.info("in_kube_perfinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_perfinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + end + + nodeAllocatableRecords = getNodeAllocatableRecords() + # Initializing continuation token to nil + continuationToken = nil + podItemsCacheSizeKB = 0 + podInventory = {} + @podCacheMutex.synchronize { + podInventory["items"] = @podItemsCache.values.clone + } + if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) + $log.info("in_kube_perfinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationToken, batchTime) + else + $log.warn "in_kube_perfinventory::enumerate:Received empty podInventory" + end + # Setting these to nil so that we dont hold memory until GC kicks in + podInventory = nil + nodeAllocatableRecords = nil + rescue => errorStr + $log.warn "in_kube_perfinventory::enumerate:Failed in enumerate: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationToken, batchTime = Time.utc.iso8601) + currentTime = Time.now + emitTime = Fluent::Engine.now + kubePerfEventStream = Fluent::MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new + @@istestvar = ENV["ISTEST"] + + begin #begin block start + podInventory["items"].each do |item| #podInventory block start + nodeName = "" + if !item["spec"]["nodeName"].nil? 
+ nodeName = item["spec"]["nodeName"] + end + + nodeAllocatableRecord = {} + if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? && nodeAllocatableRecords.has_key?(nodeName) + nodeAllocatableRecord = nodeAllocatableRecords[nodeName] + end + #container perf records + containerMetricDataItems = [] + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime)) + + containerMetricDataItems.each do |record| + kubePerfEventStream.add(emitTime, record) if record + end + + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_perfinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + kubePerfEventStream = Fluent::MultiEventStream.new + end + + # container GPU records + containerGPUInsightsMetricsDataItems = [] + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord + end + + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_perfinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = Fluent::MultiEventStream.new + end + end #podInventory block end + + if kubePerfEventStream.count > 0 + $log.info("in_kube_perfinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end + + if insightsMetricsEventStream.count > 0 + $log.info("in_kube_perfinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + insightsMetricsEventStream = nil + end + rescue => errorStr + $log.warn "Failed in parse_and_emit_record kube perf inventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end #begin block end + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_kube_perfinventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kube_perfinventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kube_perfinventory::run_periodic: enumerate Failed to retrieve perf inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock + end + @mutex.unlock + end + + def watch_pods + $log.info("in_kube_perfinventory::watch_pods:Start @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + loop do + begin + if podsResourceVersion.nil? + # clear cache before filling the cache with list + @podCacheMutex.synchronize { + @podItemsCache.clear() + } + continuationToken = nil + resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}" + $log.info("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? 
|| responseCode != "200" + $log.warn("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_perfinventory::watch_pods:Done getting pods from Kube API:#{resourceUri} @ #{Time.now.utc.iso8601}") + if (!podInventory.nil? && !podInventory.empty?) + podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) + $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory" + end + while (!continuationToken.nil? && !continuationToken.empty?) + resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}" + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + break # break, if any of the pagination call failed so that full cache will rebuild with LIST again + else + if (!podInventory.nil? && !podInventory.empty?) 
+ podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) + $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + end + end + end + end + end + if podsResourceVersion.nil? || podsResourceVersion.empty? || podsResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_perfinventory::watch_pods:received podsResourceVersion: #{podsResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_perfinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? 
+ $log.warn("in_kube_perfinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + podsResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? 
+ @podCacheMutex.synchronize { + @podItemsCache.delete(key) + } + end + end + when "ERROR" + podsResourceVersion = nil + $log.warn("in_kube_perfinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + podsResourceVersion = nil + $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + end + end + $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods @ #{Time.now.utc.iso8601}") + end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity more than readtimeout value used in the connection + # $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher + end + end + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + end + end + $log.info("in_kube_perfinventory::watch_pods:End @ #{Time.now.utc.iso8601}") + end + + def getNodeAllocatableRecords() + maxRetryCount = 5 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 + nodeAllocatableRecords = {} + begin + f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "r") + if !f.nil? 
+ isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + nodeAllocatableRecords = Yajl::Parser.parse(f) + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + $log.info "in_kube_perfinventory:getNodeAllocatableRecords:Number of Node Allocatable records: #{nodeAllocatableRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + else + raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to open file for read" + end + rescue => err + if retryAttemptCount < maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? + sleep (initialRetryDelaySecs * retryAttemptCount) + retryAttemptCount = retryAttemptCount + 1 + retry + end + $log.warn "in_kube_perfinventory:getNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) + ensure + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? 
+ end + return nodeAllocatableRecords + end + end # Kube_Pod_Input +end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index f979ef7c5..bdbc465ec 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -4,12 +4,9 @@ require "fluent/plugin/input" module Fluent::Plugin - require_relative "podinventory_to_mdm" - class Kube_PodInventory_Input < Input Fluent::Plugin.register_input("kube_podinventory", self) - @@MDMKubePodInventoryTag = "mdm.kubepodinventory" @@hostName = (OMS::Common.get_hostname) def initialize @@ -19,6 +16,8 @@ def initialize require "yajl" require "set" require "time" + require "net/http" + require "fileutils" require_relative "kubernetes_container_inventory" require_relative "KubernetesApiClient" @@ -27,11 +26,13 @@ def initialize require_relative "omslog" require_relative "constants" require_relative "extension_utils" + require_relative "CustomMetricsUtils" # refer tomlparser-agent-config for updating defaults # this configurable via configmap @PODS_CHUNK_SIZE = 0 @PODS_EMIT_STREAM_BATCH_SIZE = 0 + @NODES_CHUNK_SIZE = 0 @podCount = 0 @containerCount = 0 @@ -47,11 +48,18 @@ def initialize @controllerData = {} @podInventoryE2EProcessingLatencyMs = 0 @podsAPIE2ELatencyMs = 0 + @watchPodsThread = nil + @podItemsCache = {} + + @watchServicesThread = nil + @serviceItemsCache = {} + + @watchWinNodesThread = nil + @windowsNodeNameListCache = [] + @windowsContainerRecordsCacheSizeBytes = 0 - @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" - @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 @@ -59,7 +67,6 @@ def initialize def configure(conf) super - @inventoryToMdmConvertor = Inventory2MdmConvertor.new() end 
def start @@ -82,10 +89,26 @@ def start @PODS_EMIT_STREAM_BATCH_SIZE = 200 end $log.info("in_kube_podinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") + + if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? && ENV["NODES_CHUNK_SIZE"].to_i > 0 + @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_podinventory::start: setting to default value since got NODES_CHUNK_SIZE nil or empty") + @NODES_CHUNK_SIZE = 250 + end + $log.info("in_kube_podinventory::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}") + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new + @podCacheMutex = Mutex.new + @serviceCacheMutex = Mutex.new + @windowsNodeNameCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) + @watchWinNodesThread = Thread.new(&method(:watch_windows_nodes)) + @watchPodsThread = Thread.new(&method(:watch_pods)) + @watchServicesThread = Thread.new(&method(:watch_services)) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -97,6 +120,9 @@ def shutdown @condition.signal } @thread.join + @watchPodsThread.join + @watchServicesThread.join + @watchWinNodesThread.join super # This super must be at the end of shutdown method end end @@ -110,6 +136,7 @@ def enumerate(podList = nil) @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 + @windowsContainerRecordsCacheSizeBytes = 0 @winContainerInventoryTotalSizeBytes = 0 @winContainerCountWithInventoryRecordSize64KBOrMore = 0 @winContainerCountWithEnvVarSize64KBOrMore = 0 @@ -121,6 +148,7 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] @podInventoryE2EProcessingLatencyMs = 0 + @mdmPodRecordItems = [] podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") @@ -146,32 +174,31 @@ def enumerate(podList = nil) 
$log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") end - # Get services first so that we dont need to make a call for very chunk - $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") - # serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body) - $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") - - if !serviceInfo.nil? - $log.info("in_kube_podinventory::enumerate:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") - serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) - $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") - serviceInfo = nil - # service inventory records much smaller and fixed size compared to serviceList - serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime) - # updating for telemetry - @serviceCount += serviceRecords.length - serviceList = nil - end + serviceInventory = {} + serviceItemsCacheSizeKB = 0 + @serviceCacheMutex.synchronize { + serviceInventory["items"] = @serviceItemsCache.values.clone + if KubernetesApiClient.isEmitCacheTelemetry() + serviceItemsCacheSizeKB = @serviceItemsCache.to_s.length / 1024 + end + } + serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceInventory, batchTime) + # updating for telemetry + @serviceCount = serviceRecords.length + $log.info("in_kube_podinventory::enumerate : number of service items :#{@serviceCount} from Kube API @ #{Time.now.utc.iso8601}") - # to track e2e processing latency @podsAPIE2ELatencyMs = 0 podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil continuationToken = nil - $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ 
#{Time.now.utc.iso8601}") - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") - $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + podItemsCacheSizeKB = 0 + podInventory = {} + @podCacheMutex.synchronize { + podInventory["items"] = @podItemsCache.values.clone + if KubernetesApiClient.isEmitCacheTelemetry() + podItemsCacheSizeKB = @podItemsCache.to_s.length / 1024 + end + } podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) @@ -180,25 +207,11 @@ def enumerate(podList = nil) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") - podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i - @podsAPIE2ELatencyMs = @podsAPIE2ELatencyMs + (podsAPIChunkEndTime - podsAPIChunkStartTime) - if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
- $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) - else - $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" - end - end - @podInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - podInventoryStartTime) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil + @mdmPodRecordItems = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -213,6 +226,11 @@ def enumerate(podList = nil) telemetryProperties["Computer"] = @@hostName telemetryProperties["PODS_CHUNK_SIZE"] = @PODS_CHUNK_SIZE telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE + if KubernetesApiClient.isEmitCacheTelemetry() + telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB + telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB + telemetryProperties["WINDOWS_CONTAINER_RECORDS_CACHE_SIZE_KB"] = @windowsContainerRecordsCacheSizeBytes / 1024 + end ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) ApplicationInsightsUtility.sendMetricTelemetry("ContainerCount", @containerCount, {}) @@ -221,7 +239,7 @@ def enumerate(podList = nil) ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", @controllerSet.length, telemetryProperties) if @winContainerCount > 0 telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount - telemetryProperties["WindowsNodeCount"] = @windowsNodeCount + telemetryProperties["WindowsNodeCount"] = @windowsNodeNameListCache.length telemetryProperties["ClusterWideWindowsContainerInventoryTotalSizeKB"] = 
@winContainerInventoryTotalSizeBytes / 1024 telemetryProperties["WindowsContainerCountWithInventoryRecordSize64KBorMore"] = @winContainerCountWithInventoryRecordSize64KBOrMore if @winContainerCountWithEnvVarSize64KBOrMore > 0 @@ -257,8 +275,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc @@istestvar = ENV["ISTEST"] begin #begin block start - # Getting windows nodes from kubeapi - winNodes = KubernetesApiClient.getWindowsNodesArray podInventory["items"].each do |item| #podInventory block start # pod inventory records podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) @@ -266,40 +282,39 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc podInventoryRecords.each do |record| if !record.nil? eventStream.add(emitTime, record) if record - @inventoryToMdmConvertor.process_pod_inventory_record(record) end end # Setting this flag to true so that we can send ContainerInventory records for containers # on windows nodes and parse environment variables for these containers - if winNodes.length > 0 - nodeName = "" - if !item["spec"]["nodeName"].nil? - nodeName = item["spec"]["nodeName"] + nodeName = "" + if !item["spec"]["nodeName"].nil? + nodeName = item["spec"]["nodeName"] + end + if (!item["isWindows"].nil? && !item["isWindows"].empty? && item["isWindows"].downcase == "true") + clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] + #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel + containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true) + if KubernetesApiClient.isEmitCacheTelemetry() + @windowsContainerRecordsCacheSizeBytes += containerInventoryRecords.to_s.length end - @windowsNodeCount = winNodes.length - if (!nodeName.empty? && (winNodes.include? 
nodeName)) - clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] - #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel - containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true) - # Send container inventory records for containers on windows nodes - @winContainerCount += containerInventoryRecords.length - containerInventoryRecords.each do |cirecord| - if !cirecord.nil? - containerInventoryStream.add(emitTime, cirecord) if cirecord - ciRecordSize = cirecord.to_s.length - @winContainerInventoryTotalSizeBytes += ciRecordSize - if ciRecordSize >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY - @winContainerCountWithInventoryRecordSize64KBOrMore += 1 - end - if !cirecord["EnvironmentVar"].nil? && !cirecord["EnvironmentVar"].empty? && cirecord["EnvironmentVar"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY - @winContainerCountWithEnvVarSize64KBOrMore += 1 - end - if !cirecord["Ports"].nil? && !cirecord["Ports"].empty? && cirecord["Ports"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY - @winContainerCountWithPortsSize64KBOrMore += 1 - end - if !cirecord["Command"].nil? && !cirecord["Command"].empty? && cirecord["Command"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY - @winContainerCountWithCommandSize64KBOrMore += 1 - end + # Send container inventory records for containers on windows nodes + @winContainerCount += containerInventoryRecords.length + containerInventoryRecords.each do |cirecord| + if !cirecord.nil? + containerInventoryStream.add(emitTime, cirecord) if cirecord + ciRecordSize = cirecord.to_s.length + @winContainerInventoryTotalSizeBytes += ciRecordSize + if ciRecordSize >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithInventoryRecordSize64KBOrMore += 1 + end + if !cirecord["EnvironmentVar"].nil? && !cirecord["EnvironmentVar"].empty? 
&& cirecord["EnvironmentVar"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithEnvVarSize64KBOrMore += 1 + end + if !cirecord["Ports"].nil? && !cirecord["Ports"].empty? && cirecord["Ports"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithPortsSize64KBOrMore += 1 + end + if !cirecord["Command"].nil? && !cirecord["Command"].empty? && cirecord["Command"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithCommandSize64KBOrMore += 1 end end end @@ -313,45 +328,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc router.emit_stream(@tag, eventStream) if eventStream eventStream = Fluent::MultiEventStream.new end - - #container perf records - containerMetricDataItems = [] - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime)) - - containerMetricDataItems.each do |record| - kubePerfEventStream.add(emitTime, record) if record - end - - if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - kubePerfEventStream = Fluent::MultiEventStream.new - end - - # container GPU records - containerGPUInsightsMetricsDataItems = [] - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) - containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord - end - - if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream - insightsMetricsEventStream = Fluent::MultiEventStream.new - end end #podInventory block end if eventStream.count > 0 @@ -372,33 +348,26 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerInventoryStream = nil end - if kubePerfEventStream.count > 0 - $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - kubePerfEventStream = nil - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - end - - if insightsMetricsEventStream.count > 0 - $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + if continuationToken.nil? #no more chunks in this batch to be sent, write all mdm pod inventory records to send + if CustomMetricsUtils.check_custom_metrics_availability + begin + if !@mdmPodRecordItems.nil? 
&& @mdmPodRecordItems.length > 0 + mdmPodRecords = { + "collectionTime": batchTime, + "items": @mdmPodRecordItems, + } + mdmPodRecordsJson = mdmPodRecords.to_json + @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" + @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" + writeMDMRecords(mdmPodRecordsJson) + mdmPodRecords = nil + mdmPodRecordsJson = nil + @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" + end + rescue => err + @log.warn "in_kube_podinventory::parse_and_emit_records: failed to write MDMRecords with an error: #{err} @ #{Time.now.utc.iso8601}" + end end - insightsMetricsEventStream = nil - end - - if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send - @log.info "Sending pod inventory mdm records to out_mdm" - pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) - @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" - mdm_pod_inventory_es = Fluent::MultiEventStream.new - pod_inventory_mdm_records.each { |pod_inventory_mdm_record| - mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record - } if pod_inventory_mdm_records - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end if continuationToken.nil? 
# sending kube services inventory records @@ -477,6 +446,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record = {} begin + mdmPodRecord = {} record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated record["Name"] = item["metadata"]["name"] podNameSpace = item["metadata"]["namespace"] @@ -552,7 +522,14 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record["PodRestartCount"] = 0 #Invoke the helper method to compute ready/not ready mdm metric - @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) + mdmPodRecord["PodUid"] = podUid + mdmPodRecord["Computer"] = nodeName + mdmPodRecord["ControllerName"] = record["ControllerName"] + mdmPodRecord["Namespace"] = record["Namespace"] + mdmPodRecord["PodStatus"] = record["PodStatus"] + mdmPodRecord["PodReadyCondition"] = KubernetesApiClient.getPodReadyCondition(item["status"]["conditions"]) + mdmPodRecord["ControllerKind"] = record["ControllerKind"] + mdmPodRecord["containerRecords"] = [] podContainers = [] if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty? @@ -589,6 +566,8 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record["ContainerRestartCount"] = containerRestartCount containerStatus = container["state"] + + mdmContainerRecord = {} record["ContainerStatusReason"] = "" # state is of the following form , so just picking up the first key name # "state": { @@ -613,7 +592,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) end # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm if !record["ControllerKind"].nil? 
&& record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB - @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus) + mdmContainerRecord["state"] = containerStatus end end @@ -641,7 +620,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled if lastStateReason.downcase == Constants::REASON_OOM_KILLED - @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + mdmContainerRecord["lastState"] = container["lastState"] end lastStateReason = nil else @@ -653,7 +632,8 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) #Populate mdm metric for container restart count if greater than 0 if (!containerRestartCount.nil? && (containerRestartCount.is_a? Integer) && containerRestartCount > 0) - @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + mdmContainerRecord["restartCount"] = containerRestartCount + mdmContainerRecord["lastState"] = container["lastState"] end rescue => errorStr $log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}" @@ -662,6 +642,10 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record["ContainerLastStatus"] = Hash.new end + if !mdmContainerRecord.empty? + mdmPodRecord["containerRecords"].push(mdmContainerRecord.dup) + end + podRestartCount += containerRestartCount records.push(record.dup) end @@ -669,6 +653,8 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) records.push(record) end #container status block end + @mdmPodRecordItems.push(mdmPodRecord.dup) + records.each do |record| if !record.nil? 
record["PodRestartCount"] = podRestartCount @@ -715,5 +701,499 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) end return serviceName end + + def watch_pods + $log.info("in_kube_podinventory::watch_pods:Start @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + # invoke getWindowsNodes to handle scenario where windowsNodeNameCache not populated yet on containerstart + winNodes = KubernetesApiClient.getWindowsNodesArray() + if winNodes.length > 0 + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache = winNodes.dup + } + end + loop do + begin + if podsResourceVersion.nil? + # clear cache before filling the cache with list + @podCacheMutex.synchronize { + @podItemsCache.clear() + } + currentWindowsNodeNameList = [] + @windowsNodeNameCacheMutex.synchronize { + currentWindowsNodeNameList = @windowsNodeNameListCache.dup + } + continuationToken = nil + resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}" + $log.info("in_kube_podinventory::watch_pods:Getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_podinventory::watch_pods: getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_pods:Done getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + if (!podInventory.nil? && !podInventory.empty?) + podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? 
+ nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + isWindowsPodItem = false + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory" + end + while (!continuationToken.nil? && !continuationToken.empty?) + resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}" + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_podinventory::watch_pods: getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + break # break, if any of the pagination call failed so that full cache will rebuild with LIST again + else + if (!podInventory.nil? && !podInventory.empty?) + podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? 
item["spec"]["nodeName"] : "" + isWindowsPodItem = false + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + end + end + end + end + end + if podsResourceVersion.nil? || podsResourceVersion.empty? || podsResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_podinventory::watch_pods:received podsResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server down + else + begin + $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? 
&& + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + podsResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + currentWindowsNodeNameList = [] + @windowsNodeNameCacheMutex.synchronize { + currentWindowsNodeNameList = @windowsNodeNameListCache.dup + } + isWindowsPodItem = false + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? 
+ @podCacheMutex.synchronize { + @podItemsCache.delete(key) + } + end + end + when "ERROR" + podsResourceVersion = nil + $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + # enforce LIST again otherwise cause inconsistency by skipping a potential RV with valid data! + podsResourceVersion = nil + break + end + end + $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods @ #{Time.now.utc.iso8601}") + end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server down + ensure + watcher.finish if watcher + end + end + rescue => errorStr + $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + end + end + $log.info("in_kube_podinventory::watch_pods:End @ #{Time.now.utc.iso8601}") + end + + def watch_services + $log.info("in_kube_podinventory::watch_services:Start @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + loop do + begin + if servicesResourceVersion.nil? + # clear cache before filling the cache with list + @serviceCacheMutex.synchronize { + @serviceItemsCache.clear() + } + $log.info("in_kube_podinventory::watch_services:Getting services from Kube API @ #{Time.now.utc.iso8601}") + responseCode, serviceInfo = KubernetesApiClient.getKubeResourceInfoV2("services") + if responseCode.nil? 
|| responseCode != "200" + $log.info("in_kube_podinventory::watch_services:Getting services from Kube API failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_services: Done getting services from Kube API @ #{Time.now.utc.iso8601}") + if !serviceInfo.nil? + $log.info("in_kube_podinventory::watch_services:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceInventory = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) + $log.info("in_kube_podinventory::watch_services:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceInfo = nil + if (!serviceInventory.nil? && !serviceInventory.empty?) + servicesResourceVersion = serviceInventory["metadata"]["resourceVersion"] + if (serviceInventory.key?("items") && !serviceInventory["items"].nil? && !serviceInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_services:number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}") + serviceInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + if !serviceItem.nil? && !serviceItem.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + else + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory @ #{Time.now.utc.iso8601}" + end + serviceInventory = nil + end + end + end + if servicesResourceVersion.nil? 
|| servicesResourceVersion == "" || servicesResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_podinventory::watch_services:received servicesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server down + else + begin + $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + servicesResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? 
+ serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + if !serviceItem.nil? && !serviceItem.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + else + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache.delete(key) + } + end + end + when "ERROR" + servicesResourceVersion = nil + $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + servicesResourceVersion = nil + $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + break + end + end + end + rescue Net::ReadTimeout => errorStr + # $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server down + ensure + watcher.finish if watcher + end + end + rescue => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + end + end + $log.info("in_kube_podinventory::watch_services:End @ #{Time.now.utc.iso8601}") + end + + def watch_windows_nodes + $log.info("in_kube_podinventory::watch_windows_nodes:Start @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + loop do + begin + if nodesResourceVersion.nil? 
+ @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache.clear() + } + continuationToken = nil + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows&limit=#{@NODES_CHUNK_SIZE}") + $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_windows_nodes:Done getting windows nodes from Kube API @ #{Time.now.utc.iso8601}") + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_windows_nodes: number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + else + $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}") + if responseCode.nil? 
|| responseCode != "200" + $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + break # break, if any of the pagination call failed so that full cache can be rebuild with LIST again + else + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_windows_nodes : number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + else + $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + end + end + end + end + if nodesResourceVersion.nil? || nodesResourceVersion.empty? 
|| nodesResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_podinventory::watch_windows_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server down + else + begin + $log.info("in_kube_podinventory::watch_windows_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", label_selector: "kubernetes.io/os=windows", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_windows_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
+ break + end + if notice["type"] == "ADDED" # we dont need to worry about modified event since we only need node name + key = item["metadata"]["name"] + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + elsif notice["type"] == "DELETED" + key = item["metadata"]["name"] + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache.delete(key) + } + end + when "ERROR" + nodesResourceVersion = nil + $log.warn("in_kube_podinventory::watch_windows_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_podinventory::watch_windows_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + end + end + end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity more than readtimeout value used in the connection + # $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher + end + end + rescue => errorStr + $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + end + end + $log.info("in_kube_podinventory::watch_windows_nodes:End @ #{Time.now.utc.iso8601}") + end + + def writeMDMRecords(mdmRecordsJson) + maxRetryCount = 5 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 + begin + f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "w") + if !f.nil? 
+ isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "in_kube_podinventory:writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + f.write(mdmRecordsJson) + f.flush + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + $log.info "in_kube_podinventory:writeMDMRecords:Successfull and with time taken(ms): #{timetakenMs}" + else + raise "in_kube_podinventory:writeMDMRecords:Failed to open file for write" + end + rescue => err + if retryAttemptCount <= maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? + sleep (initialRetryDelaySecs * retryAttemptCount) + retryAttemptCount = retryAttemptCount + 1 + retry + end + $log.warn "in_kube_podinventory:writeMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) + ensure + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? + end + end end # Kube_Pod_Input end # module diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb new file mode 100644 index 000000000..bfc5227f3 --- /dev/null +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -0,0 +1,217 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require "fluent/plugin/input" + +module Fluent::Plugin + require_relative "podinventory_to_mdm" + + class Kube_PodMDMInventory_Input < Input + Fluent::Plugin.register_input("kube_podmdminventory", self) + + @@MDMKubePodInventoryTag = "mdm.kubepodinventory" + + def initialize + super + require "yaml" + require "yajl/json_gem" + require "yajl" + require "set" + require "time" + require "net/http" + require "fileutils" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + require_relative "constants" + require_relative "CustomMetricsUtils" + end + + config_param :run_interval, :time, :default => 60 + + def configure(conf) + 
super + @inventoryToMdmConvertor = Inventory2MdmConvertor.new() + end + + def start + if @run_interval + super + $log.info("in_kube_podmdminventory::start @ #{Time.now.utc.iso8601}") + @isCustomMetricsAvailability = CustomMetricsUtils.check_custom_metrics_availability + @finished = false + @prevCollectionTime = nil + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + super # This super must be at the end of shutdown method + end + end + + def enumerate + begin + if !@isCustomMetricsAvailability + $log.warn "in_kube_podmdminventory::enumerate:skipping since custom metrics not available either for this cluster type or the region" + else + parse_and_emit_records() + end + rescue => errorStr + $log.warn "in_kube_podmdminventory::enumerate:Failed in enumerate: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def parse_and_emit_records() + begin + $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:getMDMRecords @ #{Time.now.utc.iso8601}" + mdmPodRecords = getMDMRecords() + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" + if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmPodRecords["items"].length > 0 + batchTime = mdmPodRecords["collectionTime"] # This is same batchTime used in KubePODinventory + mdmPodRecords["items"].each do |record| + @inventoryToMdmConvertor.process_pod_inventory_record(record) + @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) + containerRecords = record["containerRecords"] + if !containerRecords.nil? && !containerRecords.empty? 
&& containerRecords.length > 0 + containerRecords.each do |containerRecord| + if !containerRecord["state"].nil? && !containerRecord["state"].empty? + @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) + end + begin + if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1 + lastStateName = containerRecord["lastState"].keys[0] + lastStateObject = containerRecord["lastState"][lastStateName] + if !lastStateObject.is_a?(Hash) + raise "expected a hash object. This could signify a bug or a kubernetes API change" + end + if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") + lastStateReason = lastStateObject["reason"] + lastFinishedTime = lastStateObject["finishedAt"] + #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled + if lastStateReason.downcase == Constants::REASON_OOM_KILLED + @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + lastStateReason = nil + end + end + containerRestartCount = containerRecord["restartCount"] + #Populate mdm metric for container restart count if greater than 0 + if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) + @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + rescue => err + $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}" + $log.debug_backtrace(err.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(err) + end + end + end + end + @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm @ #{Time.now.utc.iso8601}" + pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) + @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size} @ #{Time.now.utc.iso8601}" + if !pod_inventory_mdm_records.nil? && pod_inventory_mdm_records.length > 0 + startTime = (Time.now.to_f * 1000).to_i + recordCount = pod_inventory_mdm_records.length + while recordCount > 0 + record_array = pod_inventory_mdm_records.take(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE) + time_array = Array.new(record_array.length) { batchTime } + mdm_pod_inventory_es = Fluent::MultiEventStream.new(time_array, record_array) + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) + pod_inventory_mdm_records = pod_inventory_mdm_records.drop(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE) + recordCount = pod_inventory_mdm_records.length + time_array = nil + end + flushTimeMs = (Time.now.to_f * 1000).to_i - startTime + @log.info "in_kube_podmdminventory:parse_and_emit_records:timetaken to flush all Pod MDM records: #{flushTimeMs} @ #{Time.now.utc.iso8601}" + end + end + rescue => errorStr + $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = 
Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_kube_podmdminventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kube_podmdminventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kube_podmdminventory::run_periodic: enumerate Failed to retrieve pod inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock + end + @mutex.unlock + end + + def getMDMRecords() + maxRetryCount = 5 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 + mdmRecords = {} + begin + f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") + if !f.nil? + isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "in_kube_podmdminventory:getMDMRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + mdmRecords = Yajl::Parser.parse(f) + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + if mdmRecords.nil? || mdmRecords.empty? || mdmRecords["items"].nil? || mdmRecords["collectionTime"] == @prevCollectionTime + raise "in_kube_podmdminventory:getMDMRecords: either read mdmRecords is nil or empty or stale" + end + @prevCollectionTime = mdmRecords["collectionTime"] + $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords["items"].length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + else + raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read" + end + rescue => err + if retryAttemptCount <= maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? 
+ sleep (initialRetryDelaySecs * retryAttemptCount) + retryAttemptCount = retryAttemptCount + 1 + retry + end + $log.warn "in_kube_podmdminventory:getMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) + ensure + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? + end + return mdmRecords + end + end # Kube_Pod_Input +end # module diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index 82e36c8cc..81889b61b 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -50,7 +50,7 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !atLocation.nil? containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1] end - end + end containerInventoryRecord["ExitCode"] = 0 isContainerTerminated = false isContainerWaiting = false @@ -84,19 +84,19 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa end containerInfoMap = containersInfoMap[containerName] - # image can be in any one of below format in spec - # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image + # image can be in any one of below format in spec + # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image imageValue = containerInfoMap["image"] if !imageValue.nil? && !imageValue.empty? # Find delimiters in image format atLocation = imageValue.index("@") - isDigestSpecified = false + isDigestSpecified = false if !atLocation.nil? # repository/image@digest or repository/image:imagetag@digest, image@digest imageValue = imageValue[0..(atLocation - 1)] # Use Digest from the spec's image in case when the status doesnt get populated i.e. 
container in pending or image pull back etc. if containerInventoryRecord["ImageId"].nil? || containerInventoryRecord["ImageId"].empty? - containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] + containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] end isDigestSpecified = true end @@ -105,14 +105,14 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !colonLocation.nil? if slashLocation.nil? # image:imagetag - containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] + containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] else # repository/image:imagetag containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] end containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] - else + else if slashLocation.nil? # image containerInventoryRecord["Image"] = imageValue @@ -120,15 +120,15 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa # repo/image containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..-1] - end + end # if no tag specified, k8s assumes latest as imagetag and this is same behavior from docker API and from status. # Ref - https://kubernetes.io/docs/concepts/containers/images/#image-names - if isDigestSpecified == false + if isDigestSpecified == false containerInventoryRecord["ImageTag"] = "latest" end - end + end end - + podName = containerInfoMap["PodName"] namespace = containerInfoMap["Namespace"] # containername in the format what docker sees @@ -199,7 +199,12 @@ def getContainersInfoMap(podItem, isWindows) cmdValue = container["command"] cmdValueString = (cmdValue.nil?) ? 
"" : cmdValue.to_s containerInfoMap["Command"] = cmdValueString - containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) + if isWindows + # For windows container inventory, we dont need to get envvars from pods response since its already taken care in KPI as part of pod optimized item + containerInfoMap["EnvironmentVar"] = container["env"] + else + containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) + end containersInfoMap[containerName] = containerInfoMap end end @@ -212,47 +217,47 @@ def getContainersInfoMap(podItem, isWindows) return containersInfoMap end - def obtainContainerEnvironmentVars(containerId) + def obtainContainerEnvironmentVars(containerId) envValueString = "" begin - isCGroupPidFetchRequired = false + isCGroupPidFetchRequired = false if !@@containerCGroupCache.has_key?(containerId) - isCGroupPidFetchRequired = true + isCGroupPidFetchRequired = true else cGroupPid = @@containerCGroupCache[containerId] - if cGroupPid.nil? || cGroupPid.empty? + if cGroupPid.nil? || cGroupPid.empty? isCGroupPidFetchRequired = true @@containerCGroupCache.delete(containerId) - elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") + elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") isCGroupPidFetchRequired = true - @@containerCGroupCache.delete(containerId) - end + @@containerCGroupCache.delete(containerId) + end end - if isCGroupPidFetchRequired + if isCGroupPidFetchRequired Dir["/hostfs/proc/*/cgroup"].each do |filename| begin if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any? 
# file full path is /hostfs/proc//cgroup - cGroupPid = filename.split("/")[3] - if is_number?(cGroupPid) + cGroupPid = filename.split("/")[3] + if is_number?(cGroupPid) if @@containerCGroupCache.has_key?(containerId) - tempCGroupPid = @@containerCGroupCache[containerId] + tempCGroupPid = @@containerCGroupCache[containerId] if tempCGroupPid.to_i > cGroupPid.to_i @@containerCGroupCache[containerId] = cGroupPid end else @@containerCGroupCache[containerId] = cGroupPid - end + end end end - rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read - end - end + rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read + end + end end cGroupPid = @@containerCGroupCache[containerId] if !cGroupPid.nil? && !cGroupPid.empty? - environFilePath = "/hostfs/proc/#{cGroupPid}/environ" + environFilePath = "/hostfs/proc/#{cGroupPid}/environ" if File.exist?(environFilePath) # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE # Check to see if the environment variable collection is disabled for this container. @@ -265,7 +270,7 @@ def obtainContainerEnvironmentVars(containerId) if !envVars.nil? && !envVars.empty? envVars = envVars.split("\0") envValueString = envVars.to_json - envValueStringLength = envValueString.length + envValueStringLength = envValueString.length if envValueStringLength >= 200000 lastIndex = envValueString.rindex("\",") if !lastIndex.nil? 
@@ -376,6 +381,7 @@ def deleteCGroupCacheEntryForDeletedContainer(containerId) ApplicationInsightsUtility.sendExceptionTelemetry(error) end end + def is_number?(value) true if Integer(value) rescue false end diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index e882f5ec7..4561cdd9a 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -12,6 +12,7 @@ def initialize super require "net/http" require "net/https" + require "securerandom" require "uri" require "yajl/json_gem" require_relative "KubernetesApiClient" @@ -43,7 +44,6 @@ def initialize @data_hash = {} @parsed_token_uri = nil - @http_client = nil @token_expiry_time = Time.now @cached_access_token = String.new @last_post_attempt_time = Time.now @@ -63,6 +63,7 @@ def initialize @mdm_exceptions_hash = {} @mdm_exceptions_count = 0 @mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i + @proxy = nil end def configure(conf) @@ -110,15 +111,7 @@ def start end @@post_request_url = @@post_request_url_template % { metrics_endpoint: metrics_endpoint, aks_resource_id: aks_resource_id } @post_request_uri = URI.parse(@@post_request_url) - proxy = (ProxyUtils.getProxyConfiguration) - if proxy.nil? || proxy.empty? - @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) - else - @log.info "Proxy configured on this cluster: #{aks_resource_id}" - @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port, proxy[:addr], proxy[:port], proxy[:user], proxy[:pass]) - end - - @http_client.use_ssl = true + @proxy = (ProxyUtils.getProxyConfiguration) @log.info "POST Request url: #{@@post_request_url}" ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPluginStart", {}) @@ -165,6 +158,10 @@ def start end end + def multi_workers_ready? 
+ return true + end + # get the access token only if the time to expiry is less than 5 minutes and get_access_token_backoff has expired def get_access_token if (Time.now > @get_access_token_backoff_expiry) @@ -356,47 +353,56 @@ def send_to_mdm(post_body) else access_token = get_access_token end + if @proxy.nil? || @proxy.empty? + http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) + else + @log.info "Proxy configured on this cluster: #{aks_resource_id}" + http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port, @proxy[:addr], @proxy[:port], @proxy[:user], @proxy[:pass]) + end + http_client.use_ssl = true + requestId = SecureRandom.uuid.to_s request = Net::HTTP::Post.new(@post_request_uri.request_uri) request["Content-Type"] = "application/x-ndjson" request["Authorization"] = "Bearer #{access_token}" + request["x-request-id"] = requestId request.body = post_body.join("\n") - @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024}" - response = @http_client.request(request) + @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024} for requestId: #{requestId}" + response = http_client.request(request) response.value # this throws for non 200 HTTP response code - @log.info "HTTP Post Response Code : #{response.code}" + @log.info "HTTP Post Response Code : #{response.code} for requestId: #{requestId}" if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) @last_telemetry_sent_time = Time.now end rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException if !response.nil? && !response.body.nil? 
#body will have actual error - @log.info "Failed to Post Metrics to MDM : #{e} Response.body: #{response.body}" + @log.info "Failed to Post Metrics to MDM for requestId: #{requestId} exception: #{e} Response.body: #{response.body}" else - @log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}" + @log.info "Failed to Post Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" end @log.debug_backtrace(e.backtrace) if !response.code.empty? && response.code == 403.to_s - @log.info "Response Code #{response.code} Updating @last_post_attempt_time" + @log.info "Response Code #{response.code} for requestId: #{requestId} Updating @last_post_attempt_time" @last_post_attempt_time = Time.now @first_post_attempt_made = true # Not raising exception, as that will cause retries to happen elsif !response.code.empty? && response.code.start_with?("4") # Log 400 errors and continue - @log.info "Non-retryable HTTPClientException when POSTing Metrics to MDM #{e} Response: #{response}" + @log.info "Non-retryable HTTPClientException when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" else # raise if the response code is non-400 - @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" + @log.info "HTTPServerException when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" raise e end # Adding exceptions to hash to aggregate and send telemetry for all 400 error codes exception_aggregator(e) rescue Errno::ETIMEDOUT => e - @log.info "Timed out when POSTing Metrics to MDM : #{e} Response: #{response}" + @log.info "Timed out when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" @log.debug_backtrace(e.backtrace) raise e rescue Exception => e - @log.info "Exception POSTing Metrics to MDM : #{e} Response: #{response}" + @log.info "Exception POSTing Metrics to MDM for requestId: #{requestId} exception: 
#{e} Response: #{response}" @log.debug_backtrace(e.backtrace) raise e end diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index c24a91a87..a7f9c5435 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -129,7 +129,7 @@ def get_pod_inventory_mdm_records(batch_time) controllerNameDimValue: podControllerNameDimValue, podCountMetricValue: value, } - records.push(JSON.parse(record)) + records.push(Yajl::Parser.parse(record)) } #Add pod metric records @@ -218,24 +218,13 @@ def process_record_for_container_restarts_metric(podControllerNameDimValue, podN end end - def process_record_for_pods_ready_metric(podControllerNameDimValue, podNamespaceDimValue, podStatusConditions) + def process_record_for_pods_ready_metric(podControllerNameDimValue, podNamespaceDimValue, podReadyCondition) if @process_incoming_stream begin @log.info "in process_record_for_pods_ready_metric..." if podControllerNameDimValue.nil? || podControllerNameDimValue.empty? podControllerNameDimValue = "No Controller" end - podReadyCondition = false - if !podStatusConditions.nil? && !podStatusConditions.empty? - podStatusConditions.each do |condition| - if condition["type"] == "Ready" - if condition["status"].downcase == "true" - podReadyCondition = true - end - break #Exit the for loop since we found the ready condition - end - end - end MdmMetricsGenerator.generatePodReadyMetrics(podControllerNameDimValue, podNamespaceDimValue, podReadyCondition) rescue => errorStr