From 95f5b6522855643152678e0cb0bd4207f6291ca9 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 3 Jan 2022 11:11:40 -0800 Subject: [PATCH 01/65] watch and multiproc implementation --- build/linux/installer/conf/kube.conf | 396 ++++++++++-------- .../installer/datafiles/base_container.data | 3 +- kubernetes/omsagent.yaml | 5 +- source/plugins/ruby/KubernetesApiClient.rb | 385 ++++++++++++++++- source/plugins/ruby/WatchStream.rb | 63 +++ source/plugins/ruby/in_kube_nodes.rb | 288 ++++++++----- source/plugins/ruby/in_kube_podinventory.rb | 305 +++++++++++--- .../ruby/kubernetes_container_inventory.rb | 63 +-- 8 files changed, 1121 insertions(+), 387 deletions(-) create mode 100644 source/plugins/ruby/WatchStream.rb diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index a1c8bf928..1340a27a4 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -1,92 +1,78 @@ - #fluent forward plugin - - @type forward - port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" - bind 0.0.0.0 - chunk_size_limit 4m - + #fluent forward plugin + + workers 2 + root_dir /var/opt/microsoft/docker-cimprov/state + - #Kubernetes pod inventory - - @type kube_podinventory - tag oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB - run_interval 60 - @log_level debug - + #perf + + @type forward + @id out_perf_fwd + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + - #Kubernetes Persistent Volume inventory - - @type kube_pvinventory - tag oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB - run_interval 60 - @log_level debug - - - #Kubernetes events - - @type kube_events - tag 
oneagent.containerInsights.KUBE_EVENTS_BLOB - run_interval 60 - @log_level debug - - - #Kubernetes Nodes - - @type kube_nodes - tag oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB - run_interval 60 - @log_level debug - - - #Kubernetes health - - @type kube_health - tag kubehealth.ReplicaSet - run_interval 60 - @log_level debug - + #custom_metrics_mdm filter plugin for perf data from windows nodes + + @type cadvisor2mdm + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes + @log_level info + - #cadvisor perf- Windows nodes - - @type win_cadvisor_perf - tag oneagent.containerInsights.LINUX_PERF_BLOB - run_interval 60 - @log_level debug - + #containerinventory for windows containers + + @type forward + @id out_ci_fwd + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + - #Kubernetes object state - deployments - - @type kubestate_deployments - tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB - run_interval 60 - @log_level debug - - #Kubernetes object state - HPA + + #Kubernetes pod inventory - @type kubestate_hpa - tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB + @type kube_podinventory + tag oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB run_interval 60 @log_level debug - - @type inventory2mdm - @log_level info - - - #custom_metrics_mdm filter plugin for perf data from windows nodes - - @type cadvisor2mdm - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes - @log_level info - - - #health model aggregation filter - - @type health_model_builder - - #kubepodinventory @type forward @@ -108,13 +94,13 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - 
keepalive true + keepalive true - #kubepvinventory - + #kubeservices + @type forward @log_level debug send_timeout 30 @@ -126,7 +112,7 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -134,26 +120,18 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 2 - keepalive true - + keepalive true + - #InsightsMetrics - #kubestate - - @type forward + + @type mdm + @id out_mdm_podinventory @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - @type file - path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer + path /var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -161,13 +139,21 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - keepalive true + retry_mdm_post_wait_minutes 30 - #kubeevents - + #Kubernetes Nodes + + @type kube_nodes + tag oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB + run_interval 60 + @log_level debug + + + #containernodeinventory + @type forward @log_level debug send_timeout 30 @@ -179,7 +165,7 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer + path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -187,13 +173,18 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 3 keepalive true - - #kubeservices - + + + @type inventory2mdm + @log_level info + + + #kubenodeinventory + @type forward @log_level debug send_timeout 30 @@ -205,7 +196,7 @@ @type file - path 
/var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -213,25 +204,18 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 2 + flush_thread_count 5 - keepalive true - + keepalive true + - #kubenodeinventory - - @type forward + + @type mdm + @id out_mdm_nodeinventory @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - @type file - path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer + path /var/opt/microsoft/docker-cimprov/state/out_mdm_nodeinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -239,13 +223,76 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - keepalive true + retry_mdm_post_wait_minutes 30 + - #containernodeinventory - + + #fluent forward plugin + + @type forward + port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" + bind 0.0.0.0 + chunk_size_limit 4m + + + #Kubernetes Persistent Volume inventory + + @type kube_pvinventory + tag oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB + run_interval 60 + @log_level debug + + + #Kubernetes events + + @type kube_events + tag oneagent.containerInsights.KUBE_EVENTS_BLOB + run_interval 60 + @log_level debug + + + #Kubernetes health + + @type kube_health + tag kubehealth.ReplicaSet + run_interval 60 + @log_level debug + + + #cadvisor perf- Windows nodes + + @type win_cadvisor_perf + tag oneagent.containerInsights.LINUX_PERF_BLOB + run_interval 60 + @log_level debug + + + #Kubernetes object state - deployments + + @type kubestate_deployments + tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB + run_interval 60 + @log_level debug + + + #Kubernetes object state - HPA + + @type kubestate_hpa + tag 
oneagent.containerInsights.INSIGHTS_METRICS_BLOB + run_interval 60 + @log_level debug + + + #health model aggregation filter + + @type health_model_builder + + + #kubepvinventory + @type forward @log_level debug send_timeout 30 @@ -257,7 +304,7 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -265,25 +312,26 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 3 + flush_thread_count 5 - keepalive true + keepalive true - #containerinventory for windows containers - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - + #InsightsMetrics + #kubestate + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + @type file - path /var/opt/microsoft/docker-cimprov/state/containerinventory*.buffer + path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -291,13 +339,13 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 keepalive true - + - #perf - + #kubeevents + @type forward @log_level debug send_timeout 30 @@ -309,7 +357,7 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/perf*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -317,17 +365,25 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - keepalive true + keepalive true - - @type mdm - @log_level debug + #kubehealth + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + 
heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + @type file - path /var/opt/microsoft/docker-cimprov/state/out_mdm_*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubehealth*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -335,13 +391,14 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - retry_mdm_post_wait_minutes 30 + keepalive true @type mdm + @id out_mdm_perf @log_level debug @type file @@ -353,33 +410,8 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 retry_mdm_post_wait_minutes 30 - - #kubehealth - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - - - @type file - path /var/opt/microsoft/docker-cimprov/state/kubehealth*.buffer - overflow_action drop_oldest_chunk - chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s - retry_max_times 10 - retry_wait 5s - retry_max_interval 5m - flush_thread_count 5 - - keepalive true - + diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index d104a5084..a405e760f 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -178,6 +178,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/fluent/plugin/out_health_forward.rb; source/plugins/ruby/out_health_forward.rb; 644; root; root /etc/fluent/plugin/out_mdm.rb; source/plugins/ruby/out_mdm.rb; 644; root; root +/etc/fluent/plugin/WatchStream.rb; source/plugins/ruby/WatchStream.rb; 644; root; root @@ -309,7 +310,7 @@ if ${{PERFORMING_UPGRADE_NOT}}; then rmdir /etc/opt/microsoft/docker-cimprov/conf 2> /dev/null rmdir /etc/opt/microsoft/docker-cimprov 2> /dev/null rmdir /etc/opt/microsoft 2> /dev/null - rmdir /etc/opt 2> /dev/null + rmdir /etc/opt 2> 
/dev/null fi %Preinstall_0 diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index a1a843196..2ff9c5249 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -26,7 +26,7 @@ rules: verbs: ["list", "get", "watch"] - apiGroups: ["apps", "extensions", "autoscaling"] resources: ["replicasets", "deployments", "horizontalpodautoscalers"] - verbs: ["list"] + verbs: ["list", "watch"] - apiGroups: ["azmon.container.insights"] resources: ["healthstates"] verbs: ["get", "create", "patch"] @@ -607,7 +607,7 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 1 + cpu: 2 memory: 1Gi requests: cpu: 150m @@ -927,4 +927,3 @@ spec: names: plural: healthstates kind: HealthState - \ No newline at end of file diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 8925248d7..319129cae 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -11,6 +11,8 @@ class KubernetesApiClient require_relative "oms_common" require_relative "constants" + require_relative "WatchStream" + require_relative "kubernetes_container_inventory" @@ApiVersion = "v1" @@ApiVersionApps = "v1" @@ -88,7 +90,7 @@ def getTokenStr end end - def getClusterRegion(env=ENV) + def getClusterRegion(env = ENV) if env["AKS_REGION"] return env["AKS_REGION"] else @@ -97,7 +99,7 @@ def getClusterRegion(env=ENV) end end - def getResourceUri(resource, api_group, env=ENV) + def getResourceUri(resource, api_group, env = ENV) begin if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"] if api_group.nil? @@ -114,7 +116,7 @@ def getResourceUri(resource, api_group, env=ENV) end end - def getClusterName(env=ENV) + def getClusterName(env = ENV) return @@ClusterName if !@@ClusterName.nil? 
@@ClusterName = "None" begin @@ -148,7 +150,7 @@ def getClusterName(env=ENV) return @@ClusterName end - def getClusterId(env=ENV) + def getClusterId(env = ENV) return @@ClusterId if !@@ClusterId.nil? #By default initialize ClusterId to ClusterName. # In ACS/On-prem, we need to figure out how we can generate ClusterId @@ -456,19 +458,19 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricCollection = {} metricCollection["CounterName"] = metricNametoReturn metricCollection["Value"] = metricValue - + metricProps["json_Collections"] = [] - metricCollections = [] - metricCollections.push(metricCollection) + metricCollections = [] + metricCollections.push(metricCollection) metricProps["json_Collections"] = metricCollections.to_json - metricItems.push(metricProps) + metricItems.push(metricProps) #No container level limit for the given metric, so default to node level limit else nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) metricValue = @@NodeMetrics[nodeMetricsHashKey] #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") - + metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName @@ -481,10 +483,10 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricCollection["CounterName"] = metricNametoReturn metricCollection["Value"] = metricValue metricProps["json_Collections"] = [] - metricCollections = [] - metricCollections.push(metricCollection) + metricCollections = [] + metricCollections.push(metricCollection) metricProps["json_Collections"] = metricCollections.to_json - metricItems.push(metricProps) + metricItems.push(metricProps) end end end @@ -615,11 +617,11 @@ def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metri 
metricCollection["CounterName"] = metricNametoReturn metricCollection["Value"] = metricValue metricCollections = [] - metricCollections.push(metricCollection) - + metricCollections.push(metricCollection) + metricItem["json_Collections"] = [] metricItem["json_Collections"] = metricCollections.to_json - + #push node level metrics to a inmem hash so that we can use it looking up at container level. #Currently if container level cpu & memory limits are not defined we default to node level limits @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue @@ -778,7 +780,7 @@ def getResourcesAndContinuationToken(uri, api_group: nil) return continuationToken, resourceInventory end #getResourcesAndContinuationToken - def getKubeAPIServerUrl(env=ENV) + def getKubeAPIServerUrl(env = ENV) apiServerUrl = nil begin if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"] @@ -818,5 +820,356 @@ def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) end return kubeServiceRecords end + + # Accepts the following options: + # :namespace (string) - the namespace of the entity. + # :name (string) - the name of the entity to watch. + # :label_selector (string) - a selector to restrict the list of returned objects by labels. + # :field_selector (string) - a selector to restrict the list of returned objects by fields. + # :resource_version (string) - shows changes that occur after passed version of a resource. + # :allow_watch_bookmarks (bool) - flag to indicate whether to use bookmark or not. 
+ def watch(resource_name, options = {}) + begin + if !File.exist?(@@CaFile) + raise "#{@@CaFile} doesnt exist" + end + http_options = { + use_ssl: true, + open_timeout: 60, + read_timeout: 240, # https://github.com/kubernetes-client/java/issues/1370 https://github.com/kubernetes-client/java/issues/1578 + ca_file: @@CaFile, + verify_mode: OpenSSL::SSL::VERIFY_PEER, + } + http_headers = { + Authorization: "Bearer " + getTokenStr, + } + ns = "" + if !options[:namespace].to_s.empty? + ns = "namespaces/#{namespace}/" + end + path = "watch/#{ns}#{resource_name}" + path += "/#{options[:name]}" if options[:name] + api_endpoint = "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + "#{path}" + uri = URI.parse(api_endpoint) + params = {} + WATCH_ARGUMENTS.each { |k, v| params[k] = options[v] if options[v] } + uri.query = URI.encode_www_form(params) if params.any? + watcher = WatchStream.new( + uri, + http_options, + http_headers, + @Log + ) + return watcher unless block_given? + begin + watcher.each(&block) + ensure + watcher.finish if watcher + end + rescue => errorStr + @Log.warn "KubernetesApiClient::watch:Failed with an error : #{errorStr}" + end + end + + def getOptimizedItem(resource, resourceItem, winNodes = []) + case resource + when "pods" + return getPodOptimizedItem(resourceItem, winNodes) + when "nodes" + return getNodeOptimizedItem(resourceItem) + when "services" + return getServiceOptimizedItem(resourceItem) + when "deployments" + return getDeploymentOptimizedItem(resourceItem) + when "horizontalpodautoscalers" + return getHpaOptimizedItem(resourceItem) + else + return resourceItem + end + end + + def getServiceOptimizedItem(resourceItem) + item = {} + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + end + item["spec"] = {} + if !resourceItem["spec"].nil? 
+ item["spec"]["selector"] = [] + if !resourceItem["spec"]["selector"].nil? + item["spec"]["selector"] = resourceItem["spec"]["selector"] + end + item["spec"]["clusterIP"] = "" + if !resourceItem["spec"]["clusterIP"].nil? + item["spec"]["clusterIP"] = resourceItem["spec"]["clusterIP"] + end + item["spec"]["type"] = "" + if !resourceItem["spec"]["type"].nil? + item["spec"]["type"] = resourceItem["spec"]["type"] + end + end + return item + end + + def isWindowsPodItem(podItem, winNodes) + isWindowsPod = false + if !winNodes.nil? && !winNodes.empty? + nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? podItem["spec"]["nodeName"] : "" + if !nodeName.empty? && winNodes.include?(nodeName) + isWindowsPod = true + end + end + return isWindowsPod + end + + def getPodOptimizedItem(resourceItem, winNodes) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + if !resourceItem["metadata"]["annotations"].nil? + item["metadata"]["annotations"] = resourceItem["metadata"]["annotations"] + end + if !resourceItem["metadata"]["labels"].nil? + item["metadata"]["labels"] = resourceItem["metadata"]["labels"] + end + if !resourceItem["metadata"]["ownerReferences"].nil? + item["metadata"]["ownerReferences"] = resourceItem["metadata"]["ownerReferences"] + end + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + item["metadata"]["resourceVersion"] = resourceItem["metadata"]["resourceVersion"] + item["metadata"]["uid"] = resourceItem["metadata"]["uid"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + if !resourceItem["metadata"]["deletionTimestamp"].nil? + item["metadata"]["deletionTimestamp"] = resourceItem["metadata"]["deletionTimestamp"] + end + end + isWindowsPod = isWindowsPodItem(resourceItem, winNodes) + item["spec"] = {} + if !resourceItem["spec"].nil? 
+ item["spec"]["containers"] = [] + isDisableClusterCollectEnvVar = false + clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] + if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? && clusterCollectEnvironmentVar.casecmp("false") == 0 + isDisableClusterCollectEnvVar = true + end + if !resourceItem["spec"]["containers"].nil? + resourceItem["spec"]["containers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + # fields required for windows containers records + if isWindowsPod + currentContainer["image"] = container["image"] + currentContainer["ports"] = container["ports"] + currentContainer["command"] = container["command"] + currentContainer["EnvironmentVar"] = "" + if !isDisableClusterCollectEnvVar + currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + end + end + item["spec"]["containers"].push(currentContainer) + end + end + item["spec"]["initContainers"] = [] + if !resourceItem["spec"]["initContainers"].nil? + resourceItem["spec"]["initContainers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + # fields required for windows containers records + if isWindowsPod + currentContainer["image"] = container["image"] + currentContainer["ports"] = container["ports"] + currentContainer["command"] = container["command"] + currentContainer["EnvironmentVar"] = "" + if !isDisableClusterCollectEnvVar + currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + end + end + item["spec"]["initContainers"].push(currentContainer) + end + end + item["spec"]["nodeName"] = "" + if !resourceItem["spec"]["nodeName"].nil? 
+ item["spec"]["nodeName"] = resourceItem["spec"]["nodeName"] + end + end + item["status"] = {} + if !resourceItem["status"].nil? + item["status"] = resourceItem["status"] + if !resourceItem["status"]["startTime"].nil? + item["status"]["startTime"] = resourceItem["status"]["startTime"] + end + if !resourceItem["status"]["reason"].nil? + item["status"]["reason"] = resourceItem["status"]["reason"] + end + if !resourceItem["status"]["podIP"].nil? + item["status"]["podIP"] = resourceItem["status"]["podIP"] + end + if !resourceItem["status"]["phase"].nil? + item["status"]["phase"] = resourceItem["status"]["phase"] + end + item["status"]["conditions"] = [] + if !resourceItem["status"]["conditions"].nil? + resourceItem["status"]["conditions"].each do |condition| + currentCondition = {} + currentCondition["type"] = condition["type"] + currentCondition["status"] = condition["status"] + ## TODO - check if we need this + currentCondition["lastTransitionTime"] = condition["lastTransitionTime"] + item["status"]["conditions"].push(currentCondition) + end + end + item["status"]["initContainerStatuses"] = [] + if !resourceItem["status"]["initContainerStatuses"].nil? + resourceItem["status"]["initContainerStatuses"].each do |containerStatus| + currentContainerStatus = {} + currentContainerStatus["containerID"] = containerStatus["containerID"] + currentContainerStatus["name"] = containerStatus["name"] + currentContainerStatus["restartCount"] = containerStatus["restartCount"] + currentContainerStatus["state"] = containerStatus["state"] + currentContainerStatus["lastState"] = containerStatus["lastState"] + if isWindowsPod + currentContainerStatus["imageID"] = containerStatus["imageID"] + end + item["status"]["initContainerStatuses"].push(currentContainerStatus) + end + end + item["status"]["containerStatuses"] = [] + if !resourceItem["status"]["containerStatuses"].nil? 
+ resourceItem["status"]["containerStatuses"].each do |containerStatus| + currentContainerStatus = {} + currentContainerStatus["containerID"] = containerStatus["containerID"] + currentContainerStatus["name"] = containerStatus["name"] + currentContainerStatus["restartCount"] = containerStatus["restartCount"] + currentContainerStatus["state"] = containerStatus["state"] + currentContainerStatus["lastState"] = containerStatus["lastState"] + if isWindowsPod + currentContainerStatus["imageID"] = containerStatus["imageID"] + end + item["status"]["containerStatuses"].push(currentContainerStatus) + end + end + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getPodOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end + + def getNodeOptimizedItem(resourceItem) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + if !resourceItem["metadata"]["labels"].nil? + item["metadata"]["labels"] = resourceItem["metadata"]["labels"] + end + end + item["spec"] = {} + if !resourceItem["spec"].nil? + if !resourceItem["spec"]["providerID"].nil? && !resourceItem["spec"]["providerID"].empty? + item["spec"]["providerID"] = resourceItem["spec"]["providerID"] + end + end + item["status"] = {} + if !resourceItem["status"].nil? + item["status"]["conditions"] = resourceItem["status"]["conditions"] + item["status"]["nodeInfo"] = {} + nodeInfo = {} + if !resourceItem["status"]["nodeInfo"].nil? && !resourceItem["status"]["nodeInfo"].empty? 
+ nodeInfo["kubeletVersion"] = resourceItem["status"]["nodeInfo"]["kubeletVersion"] + nodeInfo["kubeProxyVersion"] = resourceItem["status"]["nodeInfo"]["kubeProxyVersion"] + nodeInfo["osImage"] = resourceItem["status"]["nodeInfo"]["osImage"] + nodeInfo["containerRuntimeVersion"] = resourceItem["status"]["nodeInfo"]["containerRuntimeVersion"] + nodeInfo["operatingSystem"] = resourceItem["status"]["nodeInfo"]["operatingSystem"] + nodeInfo["kernelVersion"] = resourceItem["status"]["nodeInfo"]["kernelVersion"] + end + item["status"]["nodeInfo"] = nodeInfo + item["status"]["allocatable"] = resourceItem["status"]["allocatable"] + item["status"]["capacity"] = resourceItem["status"]["capacity"] + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getNodeOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end + + def getDeploymentOptimizedItem(resourceItem) + item = {} + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + end + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["strategy"] = {} + if !resourceItem["spec"]["strategy"].nil? && !resourceItem["spec"]["strategy"].empty? && !resourceItem["spec"]["strategy"]["type"].nil? + item["spec"]["strategy"]["type"] = resourceItem["spec"]["strategy"]["type"] + end + if !resourceItem["spec"]["replicas"].nil? + item["spec"]["replicas"] = resourceItem["spec"]["replicas"] + end + end + item["status"] = {} + if !resourceItem["status"].nil? + if !resourceItem["status"]["readyReplicas"].nil? + item["status"]["readyReplicas"] = resourceItem["status"]["readyReplicas"] + end + if !resourceItem["status"]["updatedReplicas"].nil? + item["status"]["updatedReplicas"] = resourceItem["status"]["updatedReplicas"] + end + if !resourceItem["status"]["availableReplicas"].nil? 
+ item["status"]["availableReplicas"] = resourceItem["status"]["availableReplicas"] + end + end + return item + end + + def getHpaOptimizedItem(resourceItem) + item = {} + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + end + item["spec"] = {} + if !resourceItem["spec"].nil? + if !resourceItem["spec"]["minReplicas"].nil? + item["spec"]["minReplicas"] = resourceItem["spec"]["minReplicas"] + end + if !resourceItem["spec"]["maxReplicas"].nil? + item["spec"]["maxReplicas"] = resourceItem["spec"]["maxReplicas"] + end + item["spec"]["scaleTargetRef"] = {} + if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["kind"].nil? + item["spec"]["scaleTargetRef"]["kind"] = resourceItem["spec"]["scaleTargetRef"]["kind"] + end + if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["name"].nil? + item["spec"]["scaleTargetRef"]["name"] = resourceItem["spec"]["scaleTargetRef"]["name"] + end + end + item["status"] = {} + if !resourceItem["status"].nil? + if !resourceItem["status"]["currentReplicas"].nil? + item["status"]["currentReplicas"] = resourceItem["status"]["currentReplicas"] + end + if !resourceItem["status"]["desiredReplicas"].nil? + item["status"]["desiredReplicas"] = resourceItem["status"]["desiredReplicas"] + end + if !resourceItem["status"]["lastScaleTime"].nil? 
# HTTP chunked-transfer stream used to watch ongoing changes on Kubernetes
# entities (nodes, pods, services, ...). Each watch event arrives on the
# response body as one newline-delimited JSON document.
class WatchStream
  # uri          - URI of the watch endpoint (path and query already encoded)
  # http_options - options hash forwarded to Net::HTTP.start (e.g. use_ssl, ca_file, timeouts)
  # http_headers - request header hash (e.g. Authorization bearer token)
  # logger       - Logger instance used for diagnostics
  def initialize(uri, http_options, http_headers, logger)
    @uri = uri
    @http_client = nil
    @http_options = http_options
    @http_headers = http_headers
    # initialized here so finish()/each() never see an undefined ivar
    @finished = false
    @logger = logger
    @logger.info "WatchStream:initialize @ #{Time.now.utc.iso8601}"
  end

  # Opens the HTTP session, issues the watch GET, and yields each complete
  # newline-terminated JSON event (parsed into a Hash) to the caller's block.
  # Blocks until the server closes the stream, #finish is called from another
  # thread, or an error occurs.
  # Raises a RuntimeError when the watch request answers with an HTTP status > 300.
  def each
    @finished = false
    buffer = +""
    @logger.info "WatchStream: Opening TCP session @ #{Time.now.utc.iso8601}"
    @http_client = Net::HTTP.start(@uri.host, @uri.port, @http_options)
    path = @uri.path
    path += "?" + @uri.query if !@uri.query.nil? && !@uri.query.empty?
    @logger.info "WatchStream: Making GET API call for Watch with path: #{path} @ #{Time.now.utc.iso8601}"
    @http_client.request_get(path, @http_headers) do |response|
      if !response.nil? && response.code.to_i > 300
        raise "WatchStream: watch connection failed with an http status code: #{response.code}"
      end
      response.read_body do |chunk|
        # stop yielding promptly once finish() has been requested from another thread
        break if @finished
        buffer << chunk
        # emit every complete line; a trailing partial line stays buffered
        # until the rest of it arrives in a later chunk
        while (line = buffer.slice!(/.+\n/))
          yield(Yajl::Parser.parse(StringIO.new(line.chomp)))
        end
      end
    end
  end

  # Marks the stream finished and closes the underlying HTTP session.
  # Safe to call multiple times, before #each has run, and from another
  # thread; errors during close are logged and swallowed.
  def finish
    begin
      @finished = true
      @logger.info "WatchStream:finish HTTP session @ #{Time.now.utc.iso8601}"
      @http_client.finish if !@http_client.nil? && @http_client.started?
    rescue => error
      @logger.warn "WatchStream:finish failed with an error: #{error} @ #{Time.now.utc.iso8601}"
    end
  end
end
DateTime.now.to_time.to_i @@ -110,6 +114,7 @@ def shutdown @condition.signal } @thread.join + @watchNodesThread.join super # This super must be at the end of shutdown method end end @@ -138,7 +143,7 @@ def enumerate if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = @extensionUtils.getOutputStreamId(Constants::KUBE_NODE_INVENTORY_DATA_TYPE) end - $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using containernodeinventory tag -#{@ContainerNodeInventoryTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using kubenodeinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") @@ -147,11 +152,11 @@ def enumerate # Initializing continuation token to nil continuationToken = nil - $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - # KubernetesApiClient.getNodesResourceUri is a pure function, so call it from the actual module instead of from the mock - resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") - continuationToken, nodeInventory = @kubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory = {} + @nodeCacheMutex.synchronize { + nodeInventory["items"] = @nodeItemsCache.values.clone + #@nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024 + } nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
@@ -160,21 +165,6 @@ def enumerate else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i - continuationToken, nodeInventory = @kubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") - nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i - @nodesAPIE2ELatencyMs = @nodesAPIE2ELatencyMs + (nodesAPIChunkEndTime - nodesAPIChunkStartTime) - if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(nodeInventory, batchTime) - else - $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" - end - end - @nodeInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - nodeInventoryStartTime) timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 @@ -312,80 +302,80 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) # Adding telemetry to send node telemetry every 10 minutes timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - begin - properties = getNodeTelemetryProps(item) - properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] - capacityInfo = item["status"]["capacity"] - - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) - begin - if (!capacityInfo["nvidia.com/gpu"].nil?) 
&& (!capacityInfo["nvidia.com/gpu"].empty?) - properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] - end + #if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + begin + properties = getNodeTelemetryProps(item) + properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] + capacityInfo = item["status"]["capacity"] - if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) - properties["amdgpus"] = capacityInfo["amd.com/gpu"] - end - rescue => errorStr - $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + begin + if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) + properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] end - # Telemetry for data collection config for replicaset - if (File.file?(@@configMapMountPath)) - properties["collectAllKubeEvents"] = @@collectAllKubeEvents + if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) 
+ properties["amdgpus"] = capacityInfo["amd.com/gpu"] end + rescue => errorStr + $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength - properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength - properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength - end - # telemetry about osm metric settings for replicaset - if (File.file?(@@osmConfigMountPath)) - properties["osmNamespaceCount"] = @@osmNamespaceCount - end - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents + end - # Telemetry for data collection config for replicaset - if (File.file?(@@configMapMountPath)) - properties["collectAllKubeEvents"] = @@collectAllKubeEvents - end + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = @@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = 
@@rsPromMonitorPodsNamespaceLength + properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength + properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength + end + # telemetry about osm metric settings for replicaset + if (File.file?(@@osmConfigMountPath)) + properties["osmNamespaceCount"] = @@osmNamespaceCount + end + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + telemetrySent = true - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength - properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength - properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength - end - # telemetry about osm metric settings for replicaset - if (File.file?(@@osmConfigMountPath)) - properties["osmNamespaceCount"] = @@osmNamespaceCount - end - @applicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true - rescue => errorStr - $log.warn "Failed in getting telemetry in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - @applicationInsightsUtility.sendExceptionTelemetry(errorStr) + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents + end + + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] 
    # Background-thread entry point (started from #start): keeps @nodeItemsCache
    # in sync with the cluster using the Kubernetes list+watch protocol.
    # - When nodesResourceVersion is nil, performs a full paged LIST (clearing the
    #   cache first) and records the list's resourceVersion.
    # - Then opens a WATCH at that resourceVersion and applies ADDED/MODIFIED/
    #   DELETED events to the cache, keyed by metadata.uid.
    # - Any ERROR event, missing resourceVersion, or unexpected exception resets
    #   nodesResourceVersion to nil, which forces a fresh re-list on the next
    #   loop iteration.
    # All cache reads/writes are guarded by @nodeCacheMutex since #enumerate
    # snapshots the cache from the plugin's run thread.
    # NOTE(review): assumes KubernetesApiClient.watch yields parsed notices with
    # "type"/"object" keys (WatchStream semantics) — confirm against the client.
    def watch_nodes
      nodesResourceVersion = nil
      loop do
        begin
          if nodesResourceVersion.nil?
            # clear cache before filling the cache with list
            @nodeCacheMutex.synchronize {
              @nodeItemsCache.clear()
            }
            continuationToken = nil
            $log.info("in_kube_nodes::watch_nodes : Getting nodes from Kube API @ #{Time.now.utc.iso8601}")
            resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}")
            continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri)
            $log.info("in_kube_nodes::watch_nodes : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}")
            if (!nodeInventory.nil? && !nodeInventory.empty?)
              # the list's resourceVersion is the point from which the watch resumes
              nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"]
              if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?)
                $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
                nodeInventory["items"].each do |item|
                  # cache key is the node's UID; value is the trimmed-down item
                  key = item["metadata"]["uid"]
                  nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item)
                  @nodeCacheMutex.synchronize {
                    @nodeItemsCache[key] = nodeItem
                  }
                end
              end
            else
              $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory"
            end
            # drain remaining LIST pages while the API server hands back a continue token
            while (!continuationToken.nil? && !continuationToken.empty?)
              continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}")
              if (!nodeInventory.nil? && !nodeInventory.empty?)
                nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"]
                if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?)
                  $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
                  nodeInventory["items"].each do |item|
                    key = item["metadata"]["uid"]
                    nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item)
                    @nodeCacheMutex.synchronize {
                      @nodeItemsCache[key] = nodeItem
                    }
                  end
                end
              else
                $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory"
              end
            end
          end
          $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
          watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true)
          if watcher.nil?
            $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
          else
            watcher.each do |notice|
              case notice["type"]
              when "ADDED", "MODIFIED", "DELETED", "BOOKMARK"
                item = notice["object"]
                # extract latest resource version to use for watch reconnect
                if !item.nil? && !item.empty? &&
                   !item["metadata"].nil? && !item["metadata"].empty? &&
                   !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty?
                  nodesResourceVersion = item["metadata"]["resourceVersion"]
                  $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
                else
                  $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}")
                  nodesResourceVersion = nil
                  # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
                  break
                end
                # BOOKMARK events only advance the resourceVersion; no cache change
                if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED"))
                  key = item["metadata"]["uid"]
                  nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item)
                  @nodeCacheMutex.synchronize {
                    @nodeItemsCache[key] = nodeItem
                  }
                elsif notice["type"] == "DELETED"
                  key = item["metadata"]["uid"]
                  @nodeCacheMutex.synchronize {
                    @nodeItemsCache.delete(key)
                  }
                end
              when "ERROR"
                # e.g. 410 Gone: resourceVersion too old — force a full re-list
                nodesResourceVersion = nil
                $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}")
                break
              else
                $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}")
              end
            end
          end
        rescue Net::ReadTimeout => errorStr
          # read timeout is an expected idle-watch outcome: reconnect at the
          # current resourceVersion without re-listing
          $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
        rescue => errorStr
          $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
          nodesResourceVersion = nil
          sleep(5) # do not overwhelm the api-server if api-server broken
        ensure
          watcher.finish if watcher
        end
      end
    end
def initialize @cacheHash = {} @@ -622,7 +715,7 @@ def clean_cache() end end - nodes_to_remove.each {|node_name| + nodes_to_remove.each { |node_name| @cacheHash.delete(node_name) @timeAdded.delete(node_name) } @@ -630,7 +723,6 @@ def clean_cache() end end # NodeCache - @@cpuCache = NodeCache.new @@memCache = NodeCache.new diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 3f5f4f1cc..1ed91d9cf 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1,7 +1,7 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/input' +require "fluent/plugin/input" module Fluent::Plugin require_relative "podinventory_to_mdm" @@ -12,7 +12,6 @@ class Kube_PodInventory_Input < Input @@MDMKubePodInventoryTag = "mdm.kubepodinventory" @@hostName = (OMS::Common.get_hostname) - def initialize super require "yaml" @@ -20,6 +19,7 @@ def initialize require "yajl" require "set" require "time" + require "net/http" require_relative "kubernetes_container_inventory" require_relative "KubernetesApiClient" @@ -41,6 +41,11 @@ def initialize @controllerData = {} @podInventoryE2EProcessingLatencyMs = 0 @podsAPIE2ELatencyMs = 0 + @watchPodsThread = nil + @podItemsCache = {} + + @watchServicesThread = nil + @serviceItemsCache = {} @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @@ -79,7 +84,11 @@ def start @finished = false @condition = ConditionVariable.new @mutex = Mutex.new + @podCacheMutex = Mutex.new + @serviceCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) + @watchPodsThread = Thread.new(&method(:watch_pods)) + @watchServicesThread = Thread.new(&method(:watch_services)) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -91,6 +100,7 @@ def shutdown @condition.signal } @thread.join + @watchPodsThread.join super # This super must be at the end of shutdown 
method end end @@ -110,55 +120,49 @@ def enumerate(podList = nil) @podInventoryE2EProcessingLatencyMs = 0 podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") - if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - end - if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) - end - if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) - end - if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - end - if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) - end - $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? 
|| !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) + end + if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) + end + if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) + end + $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") end - # Get services first so that we dont need to make a call for very chunk - $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") - # serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body) - $log.info("in_kube_podinventory::enumerate : Done getting services from Kube 
API @ #{Time.now.utc.iso8601}") - - if !serviceInfo.nil? - $log.info("in_kube_podinventory::enumerate:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") - serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) - $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") - serviceInfo = nil - # service inventory records much smaller and fixed size compared to serviceList - serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime) - # updating for telemetry - @serviceCount += serviceRecords.length - serviceList = nil - end + serviceInventory = {} + @serviceCacheMutex.synchronize { + serviceInventory["items"] = @serviceItemsCache.values.clone + } + serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceInventory, batchTime) + # updating for telemetry + @serviceCount = serviceRecords.length + $log.info("in_kube_podinventory::enumerate : number of service items :#{@serviceCount} from Kube API @ #{Time.now.utc.iso8601}") # to track e2e processing latency @podsAPIE2ELatencyMs = 0 podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil continuationToken = nil - $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") - $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + #podItemsCacheSizeKB = 0 + podInventory = {} + @podCacheMutex.synchronize { + podInventory["items"] = @podItemsCache.values.clone + #podItemsCacheSizeKB = @podItemsCache.to_s.length / 1024 + } podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? 
&& !podInventory["items"].empty?) @@ -167,21 +171,6 @@ def enumerate(podList = nil) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") - podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i - @podsAPIE2ELatencyMs = @podsAPIE2ELatencyMs + (podsAPIChunkEndTime - podsAPIChunkStartTime) - if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) - else - $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" - end - end - @podInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - podInventoryStartTime) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil @@ -195,11 +184,13 @@ def enumerate(podList = nil) end # Flush AppInsights telemetry once all the processing is done + telemetryFlush = true if telemetryFlush == true telemetryProperties = {} telemetryProperties["Computer"] = @@hostName telemetryProperties["PODS_CHUNK_SIZE"] = @PODS_CHUNK_SIZE telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE + #telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) 
    # Background-thread entry point (started from #start): keeps @podItemsCache
    # in sync with the cluster via list+watch, mirroring in_kube_nodes#watch_nodes.
    # - nil podsResourceVersion triggers a full paged LIST (cache cleared first);
    #   the list's resourceVersion is then used to open the WATCH.
    # - ADDED/MODIFIED upsert and DELETED removes cache entries keyed by
    #   metadata.uid; ERROR events/exceptions reset the resourceVersion to
    #   force a re-list.
    # All cache access is serialized through @podCacheMutex because #enumerate
    # snapshots the cache from the plugin's run thread.
    def watch_pods
      podsResourceVersion = nil
      isCheckedWindowsNodes = false
      loop do
        begin
          # check if the cluster has windows nodes since windows container records requires inventory specific fields
          # NOTE(review): winNodes is re-resolved each loop iteration and only
          # re-fetched once when the first lookup comes back empty — confirm the
          # intended caching semantics against KubernetesApiClient.
          winNodes = KubernetesApiClient.getWindowsNodesArray()
          if !isCheckedWindowsNodes && winNodes.empty?
            winNodes = KubernetesApiClient.getWindowsNodes()
            isCheckedWindowsNodes = true
          end
          if podsResourceVersion.nil?
            # clear cache before filling the cache with list
            @podCacheMutex.synchronize {
              @podItemsCache.clear()
            }
            continuationToken = nil
            $log.info("in_kube_podinventory::watch_pods : Getting pods from Kube API @ #{Time.now.utc.iso8601}")
            continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}")
            $log.info("in_kube_podinventory::watch_pods : Done getting pods from Kube API @ #{Time.now.utc.iso8601}")
            if (!podInventory.nil? && !podInventory.empty?)
              # resourceVersion of the list is the watch resume point
              podsResourceVersion = podInventory["metadata"]["resourceVersion"]
              if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
                $log.info("in_kube_podinventory::watch_pods : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
                podInventory["items"].each do |item|
                  # cache key is the pod UID; value is the trimmed-down item
                  key = item["metadata"]["uid"]
                  podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes)
                  @podCacheMutex.synchronize {
                    @podItemsCache[key] = podItem
                  }
                end
              end
            else
              $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory"
            end
            # drain remaining LIST pages while a continue token is returned
            while (!continuationToken.nil? && !continuationToken.empty?)
              continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}")
              if (!podInventory.nil? && !podInventory.empty?)
                podsResourceVersion = podInventory["metadata"]["resourceVersion"]
                if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
                  $log.info("in_kube_podinventory::watch_pods : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
                  podInventory["items"].each do |item|
                    key = item["metadata"]["uid"]
                    podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes)
                    @podCacheMutex.synchronize {
                      @podItemsCache[key] = podItem
                    }
                  end
                end
              else
                $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory"
              end
            end
          end
          $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
          watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true)
          if watcher.nil?
            $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
          else
            watcher.each do |notice|
              case notice["type"]
              when "ADDED", "MODIFIED", "DELETED", "BOOKMARK"
                item = notice["object"]
                # extract latest resource version to use for watch reconnect
                if !item.nil? && !item.empty? &&
                   !item["metadata"].nil? && !item["metadata"].empty? &&
                   !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty?
                  podsResourceVersion = item["metadata"]["resourceVersion"]
                  $log.info("in_kube_podinventory::watch_pods: received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
                else
                  $log.info("in_kube_podinventory::watch_pods: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}")
                  podsResourceVersion = nil
                  # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
                  break
                end
                # BOOKMARK events only advance the resourceVersion; no cache change
                if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED"))
                  key = item["metadata"]["uid"]
                  podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes)
                  @podCacheMutex.synchronize {
                    @podItemsCache[key] = podItem
                  }
                elsif notice["type"] == "DELETED"
                  key = item["metadata"]["uid"]
                  @podCacheMutex.synchronize {
                    @podItemsCache.delete(key)
                  }
                end
              when "ERROR"
                # e.g. 410 Gone: resourceVersion too old — force a full re-list
                podsResourceVersion = nil
                $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}")
                break
              else
                $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}")
              end
            end
          end
        rescue Net::ReadTimeout => errorStr
          # idle-watch read timeout: reconnect at the current resourceVersion
          # without re-listing
          $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
        rescue => errorStr
          $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
          podsResourceVersion = nil
          sleep(5) # do not overwhelm the api-server if api-server broken
        ensure
          watcher.finish if watcher
        end
      end
    end
+ # clear cache before filling the cache with list + @serviceCacheMutex.synchronize { + @serviceItemsCache.clear() + } + $log.info("in_kube_podinventory::watch_services : Getting services from Kube API @ #{Time.now.utc.iso8601}") + serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") + $log.info("in_kube_podinventory::watch_services : Done getting services from Kube API @ #{Time.now.utc.iso8601}") + if !serviceInfo.nil? + $log.info("in_kube_podinventory::watch_services:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceInventory = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) + $log.info("in_kube_podinventory::watch_services:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceInfo = nil + if (!serviceInventory.nil? && !serviceInventory.empty?) + servicesResourceVersion = serviceInventory["metadata"]["resourceVersion"] + if (serviceInventory.key?("items") && !serviceInventory["items"].nil? && !serviceInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_services : number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}") + serviceInventory["items"].each do |item| + key = item["metadata"]["uid"] + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + end + end + else + $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory" + end + serviceInventory = nil + end + end + + $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? 
+ $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + servicesResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
+ break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + @serviceCacheMutex.synchronize { + @serviceItemsCache.delete(key) + } + end + when "ERROR" + servicesResourceVersion = nil + $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + end + end + end + rescue Net::ReadTimeout => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher + end + end + end end # Kube_Pod_Input end # module diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index 82e36c8cc..ffe92ec40 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -50,7 +50,7 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !atLocation.nil? 
containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1] end - end + end containerInventoryRecord["ExitCode"] = 0 isContainerTerminated = false isContainerWaiting = false @@ -84,19 +84,19 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa end containerInfoMap = containersInfoMap[containerName] - # image can be in any one of below format in spec - # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image + # image can be in any one of below format in spec + # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image imageValue = containerInfoMap["image"] if !imageValue.nil? && !imageValue.empty? # Find delimiters in image format atLocation = imageValue.index("@") - isDigestSpecified = false + isDigestSpecified = false if !atLocation.nil? # repository/image@digest or repository/image:imagetag@digest, image@digest imageValue = imageValue[0..(atLocation - 1)] # Use Digest from the spec's image in case when the status doesnt get populated i.e. container in pending or image pull back etc. if containerInventoryRecord["ImageId"].nil? || containerInventoryRecord["ImageId"].empty? - containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] + containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] end isDigestSpecified = true end @@ -105,14 +105,14 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !colonLocation.nil? if slashLocation.nil? 
# image:imagetag - containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] + containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] else # repository/image:imagetag containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] end containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] - else + else if slashLocation.nil? # image containerInventoryRecord["Image"] = imageValue @@ -120,15 +120,15 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa # repo/image containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..-1] - end + end # if no tag specified, k8s assumes latest as imagetag and this is same behavior from docker API and from status. # Ref - https://kubernetes.io/docs/concepts/containers/images/#image-names - if isDigestSpecified == false + if isDigestSpecified == false containerInventoryRecord["ImageTag"] = "latest" end - end + end end - + podName = containerInfoMap["PodName"] namespace = containerInfoMap["Namespace"] # containername in the format what docker sees @@ -199,7 +199,11 @@ def getContainersInfoMap(podItem, isWindows) cmdValue = container["command"] cmdValueString = (cmdValue.nil?) ? 
"" : cmdValue.to_s containerInfoMap["Command"] = cmdValueString - containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) + if isWindows + containerInfoMap["EnvironmentVar"] = container["env"] + else + containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) + end containersInfoMap[containerName] = containerInfoMap end end @@ -212,47 +216,47 @@ def getContainersInfoMap(podItem, isWindows) return containersInfoMap end - def obtainContainerEnvironmentVars(containerId) + def obtainContainerEnvironmentVars(containerId) envValueString = "" begin - isCGroupPidFetchRequired = false + isCGroupPidFetchRequired = false if !@@containerCGroupCache.has_key?(containerId) - isCGroupPidFetchRequired = true + isCGroupPidFetchRequired = true else cGroupPid = @@containerCGroupCache[containerId] - if cGroupPid.nil? || cGroupPid.empty? + if cGroupPid.nil? || cGroupPid.empty? isCGroupPidFetchRequired = true @@containerCGroupCache.delete(containerId) - elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") + elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") isCGroupPidFetchRequired = true - @@containerCGroupCache.delete(containerId) - end + @@containerCGroupCache.delete(containerId) + end end - if isCGroupPidFetchRequired + if isCGroupPidFetchRequired Dir["/hostfs/proc/*/cgroup"].each do |filename| begin if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any? 
# file full path is /hostfs/proc//cgroup - cGroupPid = filename.split("/")[3] - if is_number?(cGroupPid) + cGroupPid = filename.split("/")[3] + if is_number?(cGroupPid) if @@containerCGroupCache.has_key?(containerId) - tempCGroupPid = @@containerCGroupCache[containerId] + tempCGroupPid = @@containerCGroupCache[containerId] if tempCGroupPid.to_i > cGroupPid.to_i @@containerCGroupCache[containerId] = cGroupPid end else @@containerCGroupCache[containerId] = cGroupPid - end + end end end - rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read - end - end + rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read + end + end end cGroupPid = @@containerCGroupCache[containerId] if !cGroupPid.nil? && !cGroupPid.empty? - environFilePath = "/hostfs/proc/#{cGroupPid}/environ" + environFilePath = "/hostfs/proc/#{cGroupPid}/environ" if File.exist?(environFilePath) # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE # Check to see if the environment variable collection is disabled for this container. @@ -265,7 +269,7 @@ def obtainContainerEnvironmentVars(containerId) if !envVars.nil? && !envVars.empty? envVars = envVars.split("\0") envValueString = envVars.to_json - envValueStringLength = envValueString.length + envValueStringLength = envValueString.length if envValueStringLength >= 200000 lastIndex = envValueString.rindex("\",") if !lastIndex.nil? 
@@ -376,6 +380,7 @@ def deleteCGroupCacheEntryForDeletedContainer(containerId) ApplicationInsightsUtility.sendExceptionTelemetry(error) end end + def is_number?(value) true if Integer(value) rescue false end From 30decbb1868de03798c4634f77eccfd7971a1938 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 3 Jan 2022 19:20:09 -0800 Subject: [PATCH 02/65] fix weird bug --- source/plugins/ruby/KubernetesApiClient.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 319129cae..40f80886a 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -1000,7 +1000,6 @@ def getPodOptimizedItem(resourceItem, winNodes) end item["status"] = {} if !resourceItem["status"].nil? - item["status"] = resourceItem["status"] if !resourceItem["status"]["startTime"].nil? item["status"]["startTime"] = resourceItem["status"]["startTime"] end From 540ca90ce032c216ac2bd66069fe0807ad0c60d4 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 4 Jan 2022 18:14:50 -0800 Subject: [PATCH 03/65] multiproc support for fluentd --- build/linux/installer/conf/kube.conf | 12 +- kubernetes/linux/main.sh | 380 ++++++++++++++------------- kubernetes/omsagent.yaml | 4 +- 3 files changed, 213 insertions(+), 183 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 1340a27a4..ac9735e20 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -1,6 +1,6 @@ #fluent forward plugin - workers 2 + workers "#{ENV['NUM_OF_FLUENTD_WORKERS']}" root_dir /var/opt/microsoft/docker-cimprov/state @@ -64,7 +64,7 @@ - + #Kubernetes pod inventory @type kube_podinventory @@ -143,7 +143,8 @@ retry_mdm_post_wait_minutes 30 - + + #Kubernetes Nodes @type kube_nodes @@ -228,8 +229,7 @@ retry_mdm_post_wait_minutes 30 - - + #fluent forward plugin @type forward @@ -414,4 +414,4 @@ 
retry_mdm_post_wait_minutes 30 - + \ No newline at end of file diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index a9184ab53..023cc11e4 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -15,8 +15,7 @@ waitforlisteneronTCPport() { if [[ $port =~ $numeric ]] && [[ $waittimesecs =~ $numeric ]]; then #local varlistener=$(netstat -lnt | awk '$6 == "LISTEN" && $4 ~ ":25228$"') - while true - do + while true; do if [ $totalsleptsecs -gt $waittimesecs ]; then echo "${FUNCNAME[0]} giving up waiting for listener on port:$port after $totalsleptsecs secs" return 1 @@ -25,7 +24,7 @@ waitforlisteneronTCPport() { if [ -z "$varlistener" ]; then #echo "${FUNCNAME[0]} waiting for $sleepdurationsecs more sec for listener on port:$port ..." sleep $sleepdurationsecs - totalsleptsecs=$(($totalsleptsecs+1)) + totalsleptsecs=$(($totalsleptsecs + 1)) else echo "${FUNCNAME[0]} found listener on port:$port in $totalsleptsecs secs" return 0 @@ -57,23 +56,22 @@ checkAgentOnboardingStatus() { successMessage="Loaded data sources" failureMessage="Failed to load data sources into config" fi - while true - do - if [ $totalsleptsecs -gt $waittimesecs ]; then - echo "${FUNCNAME[0]} giving up checking agent onboarding status after $totalsleptsecs secs" - return 1 - fi - - if grep "$successMessage" "${MDSD_LOG}/mdsd.info"; then - echo "Onboarding success" - return 0 - elif grep "$failureMessage" "${MDSD_LOG}/mdsd.err"; then - echo "Onboarding Failure: Reason: Failed to onboard the agent" - echo "Onboarding Failure: Please verify log analytics workspace configuration such as existence of the workspace, workspace key and workspace enabled for public ingestion" - return 1 - fi - sleep $sleepdurationsecs - totalsleptsecs=$(($totalsleptsecs+1)) + while true; do + if [ $totalsleptsecs -gt $waittimesecs ]; then + echo "${FUNCNAME[0]} giving up checking agent onboarding status after $totalsleptsecs secs" + return 1 + fi + + if grep "$successMessage" 
"${MDSD_LOG}/mdsd.info"; then + echo "Onboarding success" + return 0 + elif grep "$failureMessage" "${MDSD_LOG}/mdsd.err"; then + echo "Onboarding Failure: Reason: Failed to onboard the agent" + echo "Onboarding Failure: Please verify log analytics workspace configuration such as existence of the workspace, workspace key and workspace enabled for public ingestion" + return 1 + fi + sleep $sleepdurationsecs + totalsleptsecs=$(($totalsleptsecs + 1)) done else echo "${FUNCNAME[0]} called with non-numeric arguments<$2>. Required arguments <#wait-time-in-seconds>" @@ -82,7 +80,6 @@ checkAgentOnboardingStatus() { fi } - #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding mkdir -p /var/opt/microsoft/docker-cimprov/state @@ -90,8 +87,8 @@ mkdir -p /var/opt/microsoft/docker-cimprov/state inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s' #Run inotify as a daemon to track changes to the mounted configmap for OSM settings. -if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || - ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then +if [[ ((! 
-e "/etc/config/kube.conf") && ("${CONTAINER_TYPE}" == "PrometheusSidecar")) || + ((-e "/etc/config/kube.conf") && ("${SIDECAR_SCRAPING_ENABLED}" == "false")) ]]; then inotifywait /etc/config/osm-settings --daemon --recursive --outfile "/opt/inotifyoutput-osm.txt" --event create,delete --format '%e : %T' --timefmt '+%s' fi @@ -100,58 +97,58 @@ if [ -z $AKS_RESOURCE_ID ]; then echo "not setting customResourceId" else export customResourceId=$AKS_RESOURCE_ID - echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc + echo "export customResourceId=$AKS_RESOURCE_ID" >>~/.bashrc source ~/.bashrc echo "customResourceId:$customResourceId" export customRegion=$AKS_REGION - echo "export customRegion=$AKS_REGION" >> ~/.bashrc + echo "export customRegion=$AKS_REGION" >>~/.bashrc source ~/.bashrc echo "customRegion:$customRegion" fi #set agent config schema version -if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/schema-version" ]; then +if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/schema-version" ]; then #trim config_schema_version="$(cat /etc/config/settings/schema-version | xargs)" #remove all spaces config_schema_version="${config_schema_version//[[:space:]]/}" #take first 10 characters - config_schema_version="$(echo $config_schema_version| cut -c1-10)" + config_schema_version="$(echo $config_schema_version | cut -c1-10)" export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version - echo "export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version" >> ~/.bashrc + echo "export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version" >>~/.bashrc source ~/.bashrc echo "AZMON_AGENT_CFG_SCHEMA_VERSION:$AZMON_AGENT_CFG_SCHEMA_VERSION" fi #set agent config file version -if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/config-version" ]; then +if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/config-version" ]; then #trim config_file_version="$(cat 
/etc/config/settings/config-version | xargs)" #remove all spaces config_file_version="${config_file_version//[[:space:]]/}" #take first 10 characters - config_file_version="$(echo $config_file_version| cut -c1-10)" + config_file_version="$(echo $config_file_version | cut -c1-10)" export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version - echo "export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version" >> ~/.bashrc + echo "export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version" >>~/.bashrc source ~/.bashrc echo "AZMON_AGENT_CFG_FILE_VERSION:$AZMON_AGENT_CFG_FILE_VERSION" fi #set OSM config schema version -if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || - ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then - if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then +if [[ ((! -e "/etc/config/kube.conf") && ("${CONTAINER_TYPE}" == "PrometheusSidecar")) || + ((-e "/etc/config/kube.conf") && ("${SIDECAR_SCRAPING_ENABLED}" == "false")) ]]; then + if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then #trim osm_config_schema_version="$(cat /etc/config/osm-settings/schema-version | xargs)" #remove all spaces osm_config_schema_version="${osm_config_schema_version//[[:space:]]/}" #take first 10 characters - osm_config_schema_version="$(echo $osm_config_schema_version| cut -c1-10)" + osm_config_schema_version="$(echo $osm_config_schema_version | cut -c1-10)" export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version - echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >> ~/.bashrc + echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >>~/.bashrc source ~/.bashrc echo "AZMON_OSM_CFG_SCHEMA_VERSION:$AZMON_OSM_CFG_SCHEMA_VERSION" fi @@ -175,7 +172,7 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then # convert the protocol prefix in lowercase for 
validation proxyprotocol=$(echo $proto | tr "[:upper:]" "[:lower:]") if [ "$proxyprotocol" != "http://" -a "$proxyprotocol" != "https://" ]; then - echo "-e error proxy endpoint should be in this format http(s)://:@:" + echo "-e error proxy endpoint should be in this format http(s)://:@:" fi # remove the protocol url="$(echo ${PROXY_ENDPOINT/$proto/})" @@ -191,53 +188,53 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then port="$(echo $hostport | sed -e 's,^.*:,:,g' -e 's,.*:\([0-9]*\).*,\1,g' -e 's,[^0-9],,g')" if [ -z "$user" -o -z "$pwd" -o -z "$host" -o -z "$port" ]; then - echo "-e error proxy endpoint should be in this format http(s)://:@:" + echo "-e error proxy endpoint should be in this format http(s)://:@:" else - echo "successfully validated provided proxy endpoint is valid and expected format" + echo "successfully validated provided proxy endpoint is valid and expected format" fi - echo $pwd > /opt/microsoft/docker-cimprov/proxy_password + echo $pwd >/opt/microsoft/docker-cimprov/proxy_password export MDSD_PROXY_MODE=application - echo "export MDSD_PROXY_MODE=$MDSD_PROXY_MODE" >> ~/.bashrc + echo "export MDSD_PROXY_MODE=$MDSD_PROXY_MODE" >>~/.bashrc export MDSD_PROXY_ADDRESS=$proto$hostport - echo "export MDSD_PROXY_ADDRESS=$MDSD_PROXY_ADDRESS" >> ~/.bashrc + echo "export MDSD_PROXY_ADDRESS=$MDSD_PROXY_ADDRESS" >>~/.bashrc export MDSD_PROXY_USERNAME=$user - echo "export MDSD_PROXY_USERNAME=$MDSD_PROXY_USERNAME" >> ~/.bashrc + echo "export MDSD_PROXY_USERNAME=$MDSD_PROXY_USERNAME" >>~/.bashrc export MDSD_PROXY_PASSWORD_FILE=/opt/microsoft/docker-cimprov/proxy_password - echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >> ~/.bashrc - + echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >>~/.bashrc + #TODO: Compression + proxy creates a deserialization error in ODS. 
This needs a fix in MDSD export MDSD_ODS_COMPRESSION_LEVEL=0 - echo "export MDSD_ODS_COMPRESSION_LEVEL=$MDSD_ODS_COMPRESSION_LEVEL" >> ~/.bashrc + echo "export MDSD_ODS_COMPRESSION_LEVEL=$MDSD_ODS_COMPRESSION_LEVEL" >>~/.bashrc fi if [ ! -z "$PROXY_ENDPOINT" ]; then - echo "Making curl request to oms endpint with domain: $domain and proxy: $PROXY_ENDPOINT" - curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT + echo "Making curl request to oms endpint with domain: $domain and proxy: $PROXY_ENDPOINT" + curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT else - echo "Making curl request to oms endpint with domain: $domain" - curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest + echo "Making curl request to oms endpint with domain: $domain" + curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest fi if [ $? -ne 0 ]; then if [ ! -z "$PROXY_ENDPOINT" ]; then - echo "Making curl request to ifconfig.co with proxy: $PROXY_ENDPOINT" - RET=`curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co --proxy $PROXY_ENDPOINT` + echo "Making curl request to ifconfig.co with proxy: $PROXY_ENDPOINT" + RET=$(curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co --proxy $PROXY_ENDPOINT) else - echo "Making curl request to ifconfig.co" - RET=`curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co` + echo "Making curl request to ifconfig.co" + RET=$(curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co) fi if [ $RET -eq 000 ]; then echo "-e error Error resolving host during the onboarding request. Check the internet connectivity and/or network policy on the cluster" else # Retrying here to work around network timing issue if [ ! -z "$PROXY_ENDPOINT" ]; then - echo "ifconfig check succeeded, retrying oms endpoint with proxy..." 
- curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT + echo "ifconfig check succeeded, retrying oms endpoint with proxy..." + curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT else - echo "ifconfig check succeeded, retrying oms endpoint..." - curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest + echo "ifconfig check succeeded, retrying oms endpoint..." + curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest fi if [ $? -ne 0 ]; then @@ -253,59 +250,57 @@ else echo "LA Onboarding:Workspace Id not mounted, skipping the telemetry check" fi - # Set environment variable for if public cloud by checking the workspace domain. if [ -z $domain ]; then - ClOUD_ENVIRONMENT="unknown" + ClOUD_ENVIRONMENT="unknown" elif [ $domain == "opinsights.azure.com" ]; then - CLOUD_ENVIRONMENT="azurepubliccloud" + CLOUD_ENVIRONMENT="azurepubliccloud" elif [ $domain == "opinsights.azure.cn" ]; then - CLOUD_ENVIRONMENT="azurechinacloud" + CLOUD_ENVIRONMENT="azurechinacloud" elif [ $domain == "opinsights.azure.us" ]; then - CLOUD_ENVIRONMENT="azureusgovernmentcloud" + CLOUD_ENVIRONMENT="azureusgovernmentcloud" elif [ $domain == "opinsights.azure.eaglex.ic.gov" ]; then - CLOUD_ENVIRONMENT="usnat" + CLOUD_ENVIRONMENT="usnat" elif [ $domain == "opinsights.azure.microsoft.scloud" ]; then - CLOUD_ENVIRONMENT="ussec" + CLOUD_ENVIRONMENT="ussec" fi export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT -echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc +echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >>~/.bashrc #consisten naming conventions with the windows export DOMAIN=$domain -echo "export DOMAIN=$DOMAIN" >> ~/.bashrc +echo "export DOMAIN=$DOMAIN" >>~/.bashrc export WSID=$workspaceId -echo "export WSID=$WSID" >> ~/.bashrc +echo "export WSID=$WSID" >>~/.bashrc # Check if the 
instrumentation key needs to be fetched from a storage account (as in airgapped clouds) -if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) +if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) for BACKOFF in {1..4}; do - KEY=$(curl -sS $APPLICATIONINSIGHTS_AUTH_URL ) + KEY=$(curl -sS $APPLICATIONINSIGHTS_AUTH_URL) # there's no easy way to get the HTTP status code from curl, so just check if the result is well formatted if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then break else - sleep $((2**$BACKOFF / 4)) # (exponential backoff) + sleep $((2 ** $BACKOFF / 4)) # (exponential backoff) fi done # validate that the retrieved data is an instrumentation key if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then export APPLICATIONINSIGHTS_AUTH=$(echo $KEY) - echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >> ~/.bashrc + echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >>~/.bashrc echo "Using cloud-specific instrumentation key" else # no ikey can be retrieved. Disable telemetry and continue export DISABLE_TELEMETRY=true - echo "export DISABLE_TELEMETRY=true" >> ~/.bashrc + echo "export DISABLE_TELEMETRY=true" >>~/.bashrc echo "Could not get cloud-specific instrumentation key (network error?). 
Disabling telemetry" fi fi - aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey -echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc +echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >>~/.bashrc source ~/.bashrc @@ -314,7 +309,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser.rb cat config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source config_env_var fi @@ -326,7 +321,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then cat agent_config_env_var | while read line; do #echo $line - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source agent_config_env_var @@ -335,7 +330,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then cat integration_npm_config_env_var | while read line; do #echo $line - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source integration_npm_config_env_var fi @@ -352,18 +347,18 @@ fi if [ ! -e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then cat defaultpromenvvariables-sidecar | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source defaultpromenvvariables-sidecar else cat defaultpromenvvariables | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source defaultpromenvvariables fi else cat defaultpromenvvariables-rs | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source defaultpromenvvariables-rs fi @@ -371,7 +366,7 @@ fi #Sourcing telemetry environment variable file if it exists if [ -e "telemetry_prom_config_env_var" ]; then cat telemetry_prom_config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source telemetry_prom_config_env_var fi @@ -384,20 +379,19 @@ if [ ! 
-e "/etc/config/kube.conf" ]; then #Sourcing config environment variable file if it exists if [ -e "side_car_fbit_config_env_var" ]; then cat side_car_fbit_config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source side_car_fbit_config_env_var fi fi fi - #Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting. if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser-mdm-metrics-config.rb cat config_mdm_metrics_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source config_mdm_metrics_env_var @@ -405,19 +399,19 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser-metric-collection-config.rb cat config_metric_collection_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source config_metric_collection_env_var fi # OSM scraping to be done in replicaset if sidecar car scraping is disabled and always do the scraping from the sidecar (It will always be either one of the two) -if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || - ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then +if [[ ((! 
-e "/etc/config/kube.conf") && ("${CONTAINER_TYPE}" == "PrometheusSidecar")) || + ((-e "/etc/config/kube.conf") && ("${SIDECAR_SCRAPING_ENABLED}" == "false")) ]]; then /usr/bin/ruby2.6 tomlparser-osm-config.rb if [ -e "integration_osm_config_env_var" ]; then cat integration_osm_config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source integration_osm_config_env_var fi @@ -427,7 +421,7 @@ fi echo "Making wget request to cadvisor endpoint with port 10250" #Defaults to use port 10255 cAdvisorIsSecure=false -RET_CODE=`wget --server-response https://$NODE_IP:10250/stats/summary --no-check-certificate --header="Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" 2>&1 | awk '/^ HTTP/{print $2}'` +RET_CODE=$(wget --server-response https://$NODE_IP:10250/stats/summary --no-check-certificate --header="Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" 2>&1 | awk '/^ HTTP/{print $2}') if [ $RET_CODE -eq 200 ]; then cAdvisorIsSecure=true fi @@ -439,17 +433,17 @@ export NODE_NAME="" if [ "$cAdvisorIsSecure" = true ]; then echo "Wget request using port 10250 succeeded. Using 10250" export IS_SECURE_CADVISOR_PORT=true - echo "export IS_SECURE_CADVISOR_PORT=true" >> ~/.bashrc + echo "export IS_SECURE_CADVISOR_PORT=true" >>~/.bashrc export CADVISOR_METRICS_URL="https://$NODE_IP:10250/metrics" - echo "export CADVISOR_METRICS_URL=https://$NODE_IP:10250/metrics" >> ~/.bashrc + echo "export CADVISOR_METRICS_URL=https://$NODE_IP:10250/metrics" >>~/.bashrc echo "Making curl request to cadvisor endpoint /pods with port 10250 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s -k -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" https://$NODE_IP:10250/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') else echo "Wget request using port 10250 failed. 
Using port 10255" export IS_SECURE_CADVISOR_PORT=false - echo "export IS_SECURE_CADVISOR_PORT=false" >> ~/.bashrc + echo "export IS_SECURE_CADVISOR_PORT=false" >>~/.bashrc export CADVISOR_METRICS_URL="http://$NODE_IP:10255/metrics" - echo "export CADVISOR_METRICS_URL=http://$NODE_IP:10255/metrics" >> ~/.bashrc + echo "export CADVISOR_METRICS_URL=http://$NODE_IP:10255/metrics" >>~/.bashrc echo "Making curl request to cadvisor endpoint with port 10255 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s http://$NODE_IP:10255/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') fi @@ -461,13 +455,13 @@ if [ ! -z "$podWithValidContainerId" ]; then containerRuntime=$(echo $containerRuntime | tr "[:upper:]" "[:lower:]") nodeName=$(echo $nodeName | tr "[:upper:]" "[:lower:]") # update runtime only if its not empty, not null and not startswith docker - if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then + if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then echo "using default container runtime as $CONTAINER_RUNTIME since got containeRuntime as empty or null" elif [[ $containerRuntime != docker* ]]; then export CONTAINER_RUNTIME=$containerRuntime fi - if [ -z "$nodeName" -o "$nodeName" == null ]; then + if [ -z "$nodeName" -o "$nodeName" == null ]; then echo "-e error nodeName in /pods API response is empty" else export NODE_NAME=$nodeName @@ -477,31 +471,31 @@ else fi echo "configured container runtime on kubelet is : "$CONTAINER_RUNTIME -echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >> ~/.bashrc +echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >>~/.bashrc export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="kubelet_runtime_operations_total" -echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >> ~/.bashrc +echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >>~/.bashrc export 
KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="kubelet_runtime_operations_errors_total" -echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >> ~/.bashrc +echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >>~/.bashrc # default to docker metrics export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_docker_operations" export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_docker_operations_errors" if [ "$CONTAINER_RUNTIME" != "docker" ]; then - # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 - export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" - export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" + # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 + export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" + export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" fi echo "set caps for ruby process to read container env from proc" sudo setcap cap_sys_ptrace,cap_dac_read_search+ep /usr/bin/ruby2.6 -echo "export KUBELET_RUNTIME_OPERATIONS_METRIC="$KUBELET_RUNTIME_OPERATIONS_METRIC >> ~/.bashrc -echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC >> ~/.bashrc +echo "export KUBELET_RUNTIME_OPERATIONS_METRIC="$KUBELET_RUNTIME_OPERATIONS_METRIC >>~/.bashrc +echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC >>~/.bashrc source ~/.bashrc -echo $NODE_NAME > /var/opt/microsoft/docker-cimprov/state/containerhostname +echo $NODE_NAME >/var/opt/microsoft/docker-cimprov/state/containerhostname #check if file was written successfully. 
cat /var/opt/microsoft/docker-cimprov/state/containerhostname @@ -514,87 +508,120 @@ dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' DOCKER_CIMPROV_VERSION=$(dpkg -l | grep docker-cimprov | awk '{print $3}') echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION -echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc +echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >>~/.bashrc #skip imds lookup since not used either legacy or aad msi auth path export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" -echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc +echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >>~/.bashrc # this used by mdsd to determine cloud specific LA endpoints export OMS_TLD=$domain -echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc +echo "export OMS_TLD=$OMS_TLD" >>~/.bashrc cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source /etc/mdsd.d/envmdsd MDSD_AAD_MSI_AUTH_ARGS="" # check if its AAD Auth MSI mode via USING_AAD_MSI_AUTH export AAD_MSI_AUTH_MODE=false if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then - echo "*** activating oneagent in aad auth msi mode ***" - # msi auth specific args - MDSD_AAD_MSI_AUTH_ARGS="-a -A" - export AAD_MSI_AUTH_MODE=true - echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc - # this used by mdsd to determine the cloud specific AMCS endpoints - export customEnvironment=$CLOUD_ENVIRONMENT - echo "export customEnvironment=$customEnvironment" >> ~/.bashrc - export MDSD_FLUENT_SOCKET_PORT="28230" - echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc - export ENABLE_MCS="true" - echo "export ENABLE_MCS=$ENABLE_MCS" >> ~/.bashrc - export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" - echo "export MONITORING_USE_GENEVA_CONFIG_SERVICE=$MONITORING_USE_GENEVA_CONFIG_SERVICE" >> ~/.bashrc - export 
MDSD_USE_LOCAL_PERSISTENCY="false" - echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >> ~/.bashrc + echo "*** activating oneagent in aad auth msi mode ***" + # msi auth specific args + MDSD_AAD_MSI_AUTH_ARGS="-a -A" + export AAD_MSI_AUTH_MODE=true + echo "export AAD_MSI_AUTH_MODE=true" >>~/.bashrc + # this used by mdsd to determine the cloud specific AMCS endpoints + export customEnvironment=$CLOUD_ENVIRONMENT + echo "export customEnvironment=$customEnvironment" >>~/.bashrc + export MDSD_FLUENT_SOCKET_PORT="28230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >>~/.bashrc + export ENABLE_MCS="true" + echo "export ENABLE_MCS=$ENABLE_MCS" >>~/.bashrc + export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" + echo "export MONITORING_USE_GENEVA_CONFIG_SERVICE=$MONITORING_USE_GENEVA_CONFIG_SERVICE" >>~/.bashrc + export MDSD_USE_LOCAL_PERSISTENCY="false" + echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >>~/.bashrc else - echo "*** activating oneagent in legacy auth mode ***" - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - #use the file path as its secure than env - CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" - echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" - export CIWORKSPACE_id=$CIWORKSPACE_id - echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc - export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile - echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc - export MDSD_FLUENT_SOCKET_PORT="29230" - echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc + echo "*** activating oneagent in legacy auth mode ***" + CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" + #use the file path as its secure than env + CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" + echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" + export CIWORKSPACE_id=$CIWORKSPACE_id + echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >>~/.bashrc + export 
CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile + echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >>~/.bashrc + export MDSD_FLUENT_SOCKET_PORT="29230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >>~/.bashrc fi source ~/.bashrc dpkg -l | grep mdsd | awk '{print $2 " " $3}' if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in sidecar container..." - #use tenant name to avoid unix socket conflict and different ports for port conflict - #roleprefix to use container specific mdsd socket - export TENANT_NAME="${CONTAINER_TYPE}" - echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc - export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default - echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc - source ~/.bashrc - mkdir /var/run/mdsd-${CONTAINER_TYPE} - # add -T 0xFFFF for full traces - mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in sidecar container..." + #use tenant name to avoid unix socket conflict and different ports for port conflict + #roleprefix to use container specific mdsd socket + export TENANT_NAME="${CONTAINER_TYPE}" + echo "export TENANT_NAME=$TENANT_NAME" >>~/.bashrc + export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default + echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >>~/.bashrc + source ~/.bashrc + mkdir /var/run/mdsd-${CONTAINER_TYPE} + # add -T 0xFFFF for full traces + mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & else - echo "starting mdsd mode in main container..." 
- # add -T 0xFFFF for full traces - mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>> /dev/null & + echo "starting mdsd mode in main container..." + # add -T 0xFFFF for full traces + mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>>/dev/null & fi # Set up a cron job for logrotation if [ ! -f /etc/cron.d/ci-agent ]; then - echo "setting up cronjob for ci agent log rotation" - echo "*/5 * * * * root /usr/sbin/logrotate -s /var/lib/logrotate/ci-agent-status /etc/logrotate.d/ci-agent >/dev/null 2>&1" > /etc/cron.d/ci-agent + echo "setting up cronjob for ci agent log rotation" + echo "*/5 * * * * root /usr/sbin/logrotate -s /var/lib/logrotate/ci-agent-status /etc/logrotate.d/ci-agent >/dev/null 2>&1" >/etc/cron.d/ci-agent fi # no dependency on fluentd for prometheus side car container if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then if [ ! 
-e "/etc/config/kube.conf" ]; then - echo "*** starting fluentd v1 in daemonset" - fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & + echo "*** starting fluentd v1 in daemonset" + fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else - echo "*** starting fluentd v1 in replicaset" - fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & + case $NUM_OF_FLUENTD_WORKERS in + 3) + export NUM_OF_FLUENTD_WORKERS=3 + export FLUENTD_POD_INVENTORY_WORKER_ID=2 + export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + 2) + export NUM_OF_FLUENTD_WORKERS=2 + export FLUENTD_POD_INVENTORY_WORKER_ID=1 + export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + + *) + export NUM_OF_FLUENTD_WORKERS=1 + export FLUENTD_POD_INVENTORY_WORKER_ID=0 + export FLUENTD_NODE_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + esac + echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc + echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc + source ~/.bashrc + + echo "*** fluentd worker configuration ***" + echo "num of workers:${NUM_OF_FLUENTD_WORKERS}" + echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" + echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}" + echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" + + echo "*** starting fluentd v1 in replicaset" + fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log 
--log-rotate-age 5 --log-rotate-size 20971520 & fi fi @@ -621,13 +648,13 @@ if [ ! -e "/etc/config/kube.conf" ]; then fi else if [ -e "/opt/telegraf-test-rs.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test-rs.conf --input-filter file -test - if [ $? -eq 0 ]; then - mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" - echo "Moving test conf file to telegraf replicaset conf since test run succeeded" - fi - echo "****************End Telegraf Run in Test Mode**************************" + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-rs.conf --input-filter file -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + echo "Moving test conf file to telegraf replicaset conf since test run succeeded" + fi + echo "****************End Telegraf Run in Test Mode**************************" fi fi @@ -671,15 +698,15 @@ else fi export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id -echo "export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id" >> ~/.bashrc +echo "export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id" >>~/.bashrc export TELEMETRY_AKS_REGION=$telemetry_aks_region -echo "export TELEMETRY_AKS_REGION=$telemetry_aks_region" >> ~/.bashrc +echo "export TELEMETRY_AKS_REGION=$telemetry_aks_region" >>~/.bashrc export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name -echo "export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name" >> ~/.bashrc +echo "export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name" >>~/.bashrc export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name -echo "export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name" >> ~/.bashrc +echo "export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name" >>~/.bashrc export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type -echo "export 
TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type" >> ~/.bashrc +echo "export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type" >>~/.bashrc #if [ ! -e "/etc/config/kube.conf" ]; then # nodename=$(cat /hostfs/etc/hostname) @@ -691,15 +718,15 @@ echo "replacing nodename in telegraf config" sed -i -e "s/placeholder_hostname/$nodename/g" $telegrafConfFile export HOST_MOUNT_PREFIX=/hostfs -echo "export HOST_MOUNT_PREFIX=/hostfs" >> ~/.bashrc +echo "export HOST_MOUNT_PREFIX=/hostfs" >>~/.bashrc export HOST_PROC=/hostfs/proc -echo "export HOST_PROC=/hostfs/proc" >> ~/.bashrc +echo "export HOST_PROC=/hostfs/proc" >>~/.bashrc export HOST_SYS=/hostfs/sys -echo "export HOST_SYS=/hostfs/sys" >> ~/.bashrc +echo "export HOST_SYS=/hostfs/sys" >>~/.bashrc export HOST_ETC=/hostfs/etc -echo "export HOST_ETC=/hostfs/etc" >> ~/.bashrc +echo "export HOST_ETC=/hostfs/etc" >>~/.bashrc export HOST_VAR=/hostfs/var -echo "export HOST_VAR=/hostfs/var" >> ~/.bashrc +echo "export HOST_VAR=/hostfs/var" >>~/.bashrc if [ ! 
-e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then @@ -735,9 +762,10 @@ service rsyslog status checkAgentOnboardingStatus $AAD_MSI_AUTH_MODE 30 shutdown() { - pkill -f mdsd - } + pkill -f mdsd +} trap "shutdown" SIGTERM -sleep inf & wait +sleep inf & +wait diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 2ff9c5249..9f9082dd2 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -607,12 +607,14 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 2 + cpu: 3 memory: 1Gi requests: cpu: 150m memory: 250Mi env: + - name: NUM_OF_FLUENTD_WORKERS + value: "3" # This value should be same as number of CPU cores specified under limits - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION From be3436ed9813019400f190ee6ed6f02d250df5db Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 4 Jan 2022 19:11:28 -0800 Subject: [PATCH 04/65] working --- build/linux/installer/conf/kube.conf | 2 +- source/plugins/ruby/in_kube_nodes.rb | 130 ++++++++++---------- source/plugins/ruby/in_kube_podinventory.rb | 1 - 3 files changed, 66 insertions(+), 67 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index ac9735e20..10a271d99 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -414,4 +414,4 @@ retry_mdm_post_wait_minutes 30 - \ No newline at end of file + diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 707cfbf9d..332066783 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -302,80 +302,80 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) # Adding telemetry to send node telemetry every 10 minutes timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - #if (timeDifferenceInMinutes >= 
@TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - begin - properties = getNodeTelemetryProps(item) - properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] - capacityInfo = item["status"]["capacity"] - - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) begin - if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) - properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] - end + properties = getNodeTelemetryProps(item) + properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] + capacityInfo = item["status"]["capacity"] + + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + begin + if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) + properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] + end - if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) - properties["amdgpus"] = capacityInfo["amd.com/gpu"] + if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) 
+ properties["amdgpus"] = capacityInfo["amd.com/gpu"] + end + rescue => errorStr + $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - rescue => errorStr - $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - # Telemetry for data collection config for replicaset - if (File.file?(@@configMapMountPath)) - properties["collectAllKubeEvents"] = @@collectAllKubeEvents - end + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents + end - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength - properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength - properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength - end - # telemetry about osm metric settings for replicaset - if (File.file?(@@osmConfigMountPath)) - properties["osmNamespaceCount"] = @@osmNamespaceCount - end - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = 
@@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength + properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength + properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength + end + # telemetry about osm metric settings for replicaset + if (File.file?(@@osmConfigMountPath)) + properties["osmNamespaceCount"] = @@osmNamespaceCount + end + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + telemetrySent = true - # Telemetry for data collection config for replicaset - if (File.file?(@@configMapMountPath)) - properties["collectAllKubeEvents"] = @@collectAllKubeEvents - end + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents + end - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength - properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength - properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength - end - # telemetry about osm metric settings for replicaset - if (File.file?(@@osmConfigMountPath)) - properties["osmNamespaceCount"] = @@osmNamespaceCount + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = 
@@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = @@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength + properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength + properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength + end + # telemetry about osm metric settings for replicaset + if (File.file?(@@osmConfigMountPath)) + properties["osmNamespaceCount"] = @@osmNamespaceCount + end + @applicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + telemetrySent = true + rescue => errorStr + $log.warn "Failed in getting telemetry in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + @applicationInsightsUtility.sendExceptionTelemetry(errorStr) end - @applicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true - rescue => errorStr - $log.warn "Failed in getting telemetry in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - @applicationInsightsUtility.sendExceptionTelemetry(errorStr) end - #end end if telemetrySent == true @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 1ed91d9cf..0ae02eea7 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -184,7 +184,6 @@ def enumerate(podList = nil) end # Flush AppInsights telemetry once all the processing is done - telemetryFlush = true if telemetryFlush == true telemetryProperties = {} telemetryProperties["Computer"] = @@hostName From c8ca6e56281bcb865817c760ebd4025ab1d66905 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem 
Date: Tue, 4 Jan 2022 23:14:20 -0800 Subject: [PATCH 05/65] fix log lines --- source/plugins/ruby/in_kube_nodes.rb | 6 +++--- source/plugins/ruby/in_kube_podinventory.rb | 14 ++++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 332066783..4708aed64 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -573,14 +573,14 @@ def watch_nodes @nodeItemsCache.clear() } continuationToken = nil - $log.info("in_kube_nodes::watch_nodes : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_nodes::watch_nodes : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") if (!nodeInventory.nil? && !nodeInventory.empty?) nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
- $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["uid"] nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 0ae02eea7..0461ad211 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -675,19 +675,20 @@ def watch_pods winNodes = KubernetesApiClient.getWindowsNodes() isCheckedWindowsNodes = true end + $log.info("in_kube_podinventory::watch_pods:number of windows nodes: #{winNodes.length} @ #{Time.now.utc.iso8601}") if podsResourceVersion.nil? # clear cache before filling the cache with list @podCacheMutex.synchronize { @podItemsCache.clear() } continuationToken = nil - $log.info("in_kube_podinventory::watch_pods : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") - $log.info("in_kube_podinventory::watch_pods : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:Done getting pods from Kube API @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty?) podsResourceVersion = podInventory["metadata"]["resourceVersion"] if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
- $log.info("in_kube_podinventory::watch_pods : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) @@ -704,7 +705,7 @@ def watch_pods if (!podInventory.nil? && !podInventory.empty?) podsResourceVersion = podInventory["metadata"]["resourceVersion"] if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_pods : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) @@ -732,9 +733,9 @@ def watch_pods !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
podsResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_pods: received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_podinventory::watch_pods: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") podsResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! break @@ -759,6 +760,7 @@ def watch_pods $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + $log.info("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end rescue Net::ReadTimeout => errorStr $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") From 6bee9547a0f7c365fb4735b073060fed74ee006b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 5 Jan 2022 21:55:57 -0800 Subject: [PATCH 06/65] refactor code --- kubernetes/omsagent.yaml | 2 +- source/plugins/ruby/KubernetesApiClient.rb | 154 ++++++------ source/plugins/ruby/WatchStream.rb | 25 +- source/plugins/ruby/in_kube_nodes.rb | 131 ++++++---- source/plugins/ruby/in_kube_podinventory.rb | 256 ++++++++++++-------- 5 files changed, 339 insertions(+), 229 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 9f9082dd2..95f9cf636 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -608,7 +608,7 @@ spec: 
resources: limits: cpu: 3 - memory: 1Gi + memory: 1.5Gi requests: cpu: 150m memory: 250Mi diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 40f80886a..dedf3c653 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -867,7 +867,7 @@ def watch(resource_name, options = {}) watcher.finish if watcher end rescue => errorStr - @Log.warn "KubernetesApiClient::watch:Failed with an error : #{errorStr}" + @Log.warn "KubernetesApiClient::watch:Failed with an error: #{errorStr}" end end @@ -890,25 +890,29 @@ def getOptimizedItem(resource, resourceItem, winNodes = []) def getServiceOptimizedItem(resourceItem) item = {} - item["metadata"] = {} - if !resourceItem["metadata"].nil? - item["metadata"]["name"] = resourceItem["metadata"]["name"] - item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] - end - item["spec"] = {} - if !resourceItem["spec"].nil? - item["spec"]["selector"] = [] - if !resourceItem["spec"]["selector"].nil? - item["spec"]["selector"] = resourceItem["spec"]["selector"] - end - item["spec"]["clusterIP"] = "" - if !resourceItem["spec"]["clusterIP"].nil? - item["spec"]["clusterIP"] = resourceItem["spec"]["clusterIP"] + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] end - item["spec"]["type"] = "" - if !resourceItem["spec"]["type"].nil? - item["spec"]["type"] = resourceItem["spec"]["type"] + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["selector"] = [] + if !resourceItem["spec"]["selector"].nil? + item["spec"]["selector"] = resourceItem["spec"]["selector"] + end + item["spec"]["clusterIP"] = "" + if !resourceItem["spec"]["clusterIP"].nil? 
+ item["spec"]["clusterIP"] = resourceItem["spec"]["clusterIP"] + end + item["spec"]["type"] = "" + if !resourceItem["spec"]["type"].nil? + item["spec"]["type"] = resourceItem["spec"]["type"] + end end + rescue => errorStr + @Log.warn "KubernetesApiClient::getServiceOptimizedItem:Failed with an error : #{errorStr}" end return item end @@ -1102,71 +1106,79 @@ def getNodeOptimizedItem(resourceItem) def getDeploymentOptimizedItem(resourceItem) item = {} - item["metadata"] = {} - if !resourceItem["metadata"].nil? - item["metadata"]["name"] = resourceItem["metadata"]["name"] - item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] - end - item["spec"] = {} - if !resourceItem["spec"].nil? - item["spec"]["strategy"] = {} - if !resourceItem["spec"]["strategy"].nil? && !resourceItem["spec"]["strategy"].empty? && !resourceItem["spec"]["strategy"]["type"].nil? - item["spec"]["strategy"]["type"] = resourceItem["spec"]["strategy"]["type"] - end - if !resourceItem["spec"]["replicas"].nil? - item["spec"]["replicas"] = resourceItem["spec"]["replicas"] - end - end - item["status"] = {} - if !resourceItem["status"].nil? - if !resourceItem["status"]["readyReplicas"].nil? - item["status"]["readyReplicas"] = resourceItem["status"]["readyReplicas"] + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] end - if !resourceItem["status"]["updatedReplicas"].nil? - item["status"]["updatedReplicas"] = resourceItem["status"]["updatedReplicas"] + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["strategy"] = {} + if !resourceItem["spec"]["strategy"].nil? && !resourceItem["spec"]["strategy"].empty? && !resourceItem["spec"]["strategy"]["type"].nil? + item["spec"]["strategy"]["type"] = resourceItem["spec"]["strategy"]["type"] + end + if !resourceItem["spec"]["replicas"].nil? 
+ item["spec"]["replicas"] = resourceItem["spec"]["replicas"] + end end - if !resourceItem["status"]["availableReplicas"].nil? - item["status"]["availableReplicas"] = resourceItem["status"]["availableReplicas"] + item["status"] = {} + if !resourceItem["status"].nil? + if !resourceItem["status"]["readyReplicas"].nil? + item["status"]["readyReplicas"] = resourceItem["status"]["readyReplicas"] + end + if !resourceItem["status"]["updatedReplicas"].nil? + item["status"]["updatedReplicas"] = resourceItem["status"]["updatedReplicas"] + end + if !resourceItem["status"]["availableReplicas"].nil? + item["status"]["availableReplicas"] = resourceItem["status"]["availableReplicas"] + end end + rescue => errorStr + @Log.warn "KubernetesApiClient::getDeploymentOptimizedItem:Failed with an error : #{errorStr}" end return item end def getHpaOptimizedItem(resourceItem) item = {} - item["metadata"] = {} - if !resourceItem["metadata"].nil? - item["metadata"]["name"] = resourceItem["metadata"]["name"] - item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] - item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] - end - item["spec"] = {} - if !resourceItem["spec"].nil? - if !resourceItem["spec"]["minReplicas"].nil? - item["spec"]["minReplicas"] = resourceItem["spec"]["minReplicas"] - end - if !resourceItem["spec"]["maxReplicas"].nil? - item["spec"]["maxReplicas"] = resourceItem["spec"]["maxReplicas"] - end - item["spec"]["scaleTargetRef"] = {} - if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["kind"].nil? - item["spec"]["scaleTargetRef"]["kind"] = resourceItem["spec"]["scaleTargetRef"]["kind"] - end - if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["name"].nil? - item["spec"]["scaleTargetRef"]["name"] = resourceItem["spec"]["scaleTargetRef"]["name"] - end - end - item["status"] = {} - if !resourceItem["status"].nil? 
- if !resourceItem["status"]["currentReplicas"].nil? - item["status"]["currentReplicas"] = resourceItem["status"]["currentReplicas"] + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] end - if !resourceItem["status"]["desiredReplicas"].nil? - item["status"]["desiredReplicas"] = resourceItem["status"]["desiredReplicas"] + item["spec"] = {} + if !resourceItem["spec"].nil? + if !resourceItem["spec"]["minReplicas"].nil? + item["spec"]["minReplicas"] = resourceItem["spec"]["minReplicas"] + end + if !resourceItem["spec"]["maxReplicas"].nil? + item["spec"]["maxReplicas"] = resourceItem["spec"]["maxReplicas"] + end + item["spec"]["scaleTargetRef"] = {} + if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["kind"].nil? + item["spec"]["scaleTargetRef"]["kind"] = resourceItem["spec"]["scaleTargetRef"]["kind"] + end + if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["name"].nil? + item["spec"]["scaleTargetRef"]["name"] = resourceItem["spec"]["scaleTargetRef"]["name"] + end end - if !resourceItem["status"]["lastScaleTime"].nil? - item["status"]["lastScaleTime"] = resourceItem["status"]["lastScaleTime"] + item["status"] = {} + if !resourceItem["status"].nil? + if !resourceItem["status"]["currentReplicas"].nil? + item["status"]["currentReplicas"] = resourceItem["status"]["currentReplicas"] + end + if !resourceItem["status"]["desiredReplicas"].nil? + item["status"]["desiredReplicas"] = resourceItem["status"]["desiredReplicas"] + end + if !resourceItem["status"]["lastScaleTime"].nil? 
+ item["status"]["lastScaleTime"] = resourceItem["status"]["lastScaleTime"] + end end + rescue => errorStr + @Log.warn "KubernetesApiClient::getHpaOptimizedItem:Failed with an error : #{errorStr}" end return item end diff --git a/source/plugins/ruby/WatchStream.rb b/source/plugins/ruby/WatchStream.rb index 6633d26d5..6cc850450 100644 --- a/source/plugins/ruby/WatchStream.rb +++ b/source/plugins/ruby/WatchStream.rb @@ -23,22 +23,29 @@ def initialize(uri, http_options, http_headers, logger) @http_options = http_options @http_headers = http_headers @logger = logger - @logger.info "WatchStream:initialize @ #{Time.now.utc.iso8601}" + @path = "" + @logger.info "WatchStream::initialize @ #{Time.now.utc.iso8601}" end def each @finished = false buffer = +"" - @logger.info "WatchStream: Opening TCP session @ #{Time.now.utc.iso8601}" + @logger.info "WatchStream::each:Opening TCP session @ #{Time.now.utc.iso8601}" @http_client = Net::HTTP.start(@uri.host, @uri.port, @http_options) - path = @uri.path + if @http_client.nil? + raise "WatchStream::each:Failed to create HTTPClient object @ #{Time.now.utc.iso8601}" + end + @path = @uri.path + if @path.nil? || @path.empty? + raise "WatchStream::each:URI path should not be empty or nil @ #{Time.now.utc.iso8601}" + end if !@uri.query.nil? && !@uri.query.empty? - path += "?" + @uri.query + @path += "?" + @uri.query end - @logger.info "WatchStream: Making GET API call for Watch with path: #{path} @ #{Time.now.utc.iso8601}" - @http_client.request_get(path, @http_headers) do |response| + @logger.info "WatchStream::each:Making GET API call for Watch with path: #{@path} @ #{Time.now.utc.iso8601}" + @http_client.request_get(@path, @http_headers) do |response| if !response.nil? 
&& response.code.to_i > 300 - raise "WatchStream: watch connection failed with an http status code: #{response.code}" + raise "WatchStream::each:Watch connection of the path: #{@path} failed with an http status code: #{response.code} @ #{Time.now.utc.iso8601}" end response.read_body do |chunk| buffer << chunk @@ -54,10 +61,10 @@ def each def finish begin @finished = true - @logger.info "WatchStream:finish HTTP session @ #{Time.now.utc.iso8601}" + @logger.info "WatchStream::finish:Closing HTTP session of the path:#{@path} @ #{Time.now.utc.iso8601}" @http_client.finish if !@http_client.nil? && @http_client.started? rescue => error - @logger.warn "WatchStream:finish failed with an error: #{error} @ #{Time.now.utc.iso8601}" + @logger.warn "WatchStream::finish:Closing of HTTP session of the path: #{@path} failed with an error: #{error} @ #{Time.now.utc.iso8601}" end end end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 4708aed64..3c49ebabf 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -564,6 +564,7 @@ def getNodeTelemetryProps(item) end def watch_nodes + $log.info("in_kube_nodes::watch_nodes:Start @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil loop do begin @@ -583,10 +584,18 @@ def watch_nodes $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["uid"] - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem - } + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? 
+ @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + end end end else @@ -600,10 +609,18 @@ def watch_nodes $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["uid"] - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem - } + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? + @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + end end end else @@ -611,58 +628,74 @@ def watch_nodes end end end - $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
- nodesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + begin + $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? 
+ @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + @nodeCacheMutex.synchronize { + @nodeItemsCache.delete(key) + } + end + end + when "ERROR" nodesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") break + else + $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem - } - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - @nodeCacheMutex.synchronize { - @nodeItemsCache.delete(key) - } - end - when "ERROR" - nodesResourceVersion = nil - $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + rescue Net::ReadTimeout => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ 
#{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end end + $log.info("in_kube_nodes::watch_nodes:End @ #{Time.now.utc.iso8601}") end end # Kube_Node_Input diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 0461ad211..dde92236d 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -665,6 +665,7 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) end def watch_pods + $log.info("in_kube_podinventory::watch_pods:Start @ #{Time.now.utc.iso8601}") podsResourceVersion = nil isCheckedWindowsNodes = false loop do @@ -691,10 +692,18 @@ def watch_pods $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) - @podCacheMutex.synchronize { - @podItemsCache[key] = podItem - } + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + if !podItem.nil? && !podItem.empty? 
+ @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + end end end else @@ -708,10 +717,18 @@ def watch_pods $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) - @podCacheMutex.synchronize { - @podItemsCache[key] = podItem - } + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + end end end else @@ -719,62 +736,80 @@ def watch_pods end end end - $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
- podsResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + begin + $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + podsResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? 
+ podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + @podCacheMutex.synchronize { + @podItemsCache.delete(key) + } + end + end + when "ERROR" podsResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") break + else + $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) - @podCacheMutex.synchronize { - @podItemsCache[key] = podItem - } - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - @podCacheMutex.synchronize { - @podItemsCache.delete(key) - } - end - when "ERROR" - podsResourceVersion = nil - $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end + $log.info("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end - $log.info("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + rescue Net::ReadTimeout => errorStr + ## This 
expected if there is no activity more than readtimeout value used in the connection + $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") podsResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end end + $log.info("in_kube_podinventory::watch_pods:End @ #{Time.now.utc.iso8601}") end def watch_services + $log.info("in_kube_podinventory::watch_services:Start @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil loop do begin @@ -783,9 +818,9 @@ def watch_services @serviceCacheMutex.synchronize { @serviceItemsCache.clear() } - $log.info("in_kube_podinventory::watch_services : Getting services from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_services:Getting services from Kube API @ #{Time.now.utc.iso8601}") serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") - $log.info("in_kube_podinventory::watch_services : Done getting services from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_services: Done getting services from Kube API @ #{Time.now.utc.iso8601}") if !serviceInfo.nil? $log.info("in_kube_podinventory::watch_services:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") serviceInventory = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) @@ -794,13 +829,21 @@ def watch_services if (!serviceInventory.nil? 
&& !serviceInventory.empty?) servicesResourceVersion = serviceInventory["metadata"]["resourceVersion"] if (serviceInventory.key?("items") && !serviceInventory["items"].nil? && !serviceInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_services : number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_services:number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}") serviceInventory["items"].each do |item| key = item["metadata"]["uid"] - serviceItem = KubernetesApiClient.getOptimizedItem("services", item) - @serviceCacheMutex.synchronize { - @serviceItemsCache[key] = serviceItem - } + if !key.nil? && !key.empty? + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + if !serviceItem.nil? && !serviceItem.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + else + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty" + end + else + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty" + end end end else @@ -809,59 +852,74 @@ def watch_services serviceInventory = nil end end - - $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? 
&& !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - servicesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + begin + $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + servicesResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
+ break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + if !serviceItem.nil? && !serviceItem.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + else + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty" + end + else + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache.delete(key) + } + end + end + when "ERROR" servicesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") break + else + $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - serviceItem = KubernetesApiClient.getOptimizedItem("services", item) - @serviceCacheMutex.synchronize { - @serviceItemsCache[key] = serviceItem - } - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - @serviceCacheMutex.synchronize { - @serviceItemsCache.delete(key) - } - end - when "ERROR" - servicesResourceVersion = nil - $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + rescue Net::ReadTimeout => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ 
#{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end end + $log.info("in_kube_podinventory::watch_services:End @ #{Time.now.utc.iso8601}") end end # Kube_Pod_Input end # module From 0593d020bf989f32a607d91c3284e95aded9a56b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 7 Jan 2022 10:50:34 -0800 Subject: [PATCH 07/65] cache telemetry --- kubernetes/omsagent.yaml | 2 ++ source/plugins/ruby/KubernetesApiClient.rb | 8 ++++++++ source/plugins/ruby/in_kube_nodes.rb | 15 ++++++++++----- source/plugins/ruby/in_kube_podinventory.rb | 15 ++++++++++++--- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 95f9cf636..d5545f041 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -392,6 +392,8 @@ spec: # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests - name: ISTEST value: "true" + - name: EMIT_CACHE_TELEMETRY + value: "false" #Uncomment below two lines for ACS clusters and set the cluster names manually. 
Also comment out the above two lines for ACS clusters #- name: ACS_RESOURCE_NAME # value: "my_acs_cluster_name" diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index dedf3c653..003dab9cf 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -1182,5 +1182,13 @@ def getHpaOptimizedItem(resourceItem) end return item end + + def isEmitCacheTelemetry + isEmitCacheTelemtryEnabled = false + if !ENV["EMIT_CACHE_TELEMETRY"].nil? && !ENV["EMIT_CACHE_TELEMETRY"].empty? && ENV["EMIT_CACHE_TELEMETRY"].downcase == "true" + isEmitCacheTelemtryEnabled = true + end + return isEmitCacheTelemtryEnabled + end end end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 3c49ebabf..2d3417622 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -65,7 +65,6 @@ def initialize(kubernetesApiClient = nil, @NodeCache = NodeStatsCache.new() @watchNodesThread = nil @nodeItemsCache = {} - #@nodeItemsCacheSizeKB = 0 end config_param :run_interval, :time, :default => 60 @@ -153,9 +152,12 @@ def enumerate # Initializing continuation token to nil continuationToken = nil nodeInventory = {} + nodeItemsCacheSizeKB = 0 @nodeCacheMutex.synchronize { nodeInventory["items"] = @nodeItemsCache.values.clone - #@nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024 + if KubernetesApiClient.isEmitCacheTelemetry() + nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024 + end } nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) @@ -169,8 +171,12 @@ def enumerate timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - 
@applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) - @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) + telemetryProperties = {} + if KubernetesApiClient.isEmitCacheTelemetry() + telemetryProperties["NODE_ITEMS_CACHE_SIZE_KB"] = nodeItemsCacheSizeKB + end + @applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, telemetryProperties) + @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, telemetryProperties) @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end # Setting this to nil so that we dont hold memory until GC kicks in @@ -556,7 +562,6 @@ def getNodeTelemetryProps(item) end properties["NODES_CHUNK_SIZE"] = @NODES_CHUNK_SIZE properties["NODES_EMIT_STREAM_BATCH_SIZE"] = @NODES_EMIT_STREAM_BATCH_SIZE - #properties["NODE_ITEMS_CACHE_SIZE_KB"] = @nodeItemsCacheSizeKB rescue => errorStr $log.warn "in_kube_nodes::getContainerNodeIngetNodeTelemetryPropsventoryRecord:Failed: #{errorStr}" end diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index dde92236d..7ed5e29cf 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -144,8 +144,12 @@ def enumerate(podList = nil) end serviceInventory = {} + serviceItemsCacheSizeKB = 0 @serviceCacheMutex.synchronize { serviceInventory["items"] = @serviceItemsCache.values.clone + if KubernetesApiClient.isEmitCacheTelemetry() + serviceItemsCacheSizeKB = @serviceItemsCache.to_s.length / 1024 + end } serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceInventory, batchTime) # updating for telemetry @@ -157,11 +161,13 @@ def enumerate(podList = nil) podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil 
continuationToken = nil - #podItemsCacheSizeKB = 0 + podItemsCacheSizeKB = 0 podInventory = {} @podCacheMutex.synchronize { podInventory["items"] = @podItemsCache.values.clone - #podItemsCacheSizeKB = @podItemsCache.to_s.length / 1024 + if KubernetesApiClient.isEmitCacheTelemetry() + podItemsCacheSizeKB = @podItemsCache.to_s.length / 1024 + end } podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) @@ -189,7 +195,10 @@ def enumerate(podList = nil) telemetryProperties["Computer"] = @@hostName telemetryProperties["PODS_CHUNK_SIZE"] = @PODS_CHUNK_SIZE telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE - #telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB + if KubernetesApiClient.isEmitCacheTelemetry() + telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB + telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB + end ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) ApplicationInsightsUtility.sendMetricTelemetry("ServiceCount", @serviceCount, {}) From 3f11a273f9473baa97731c7176a248ba129ae134 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 7 Jan 2022 18:06:21 -0800 Subject: [PATCH 08/65] nodecount telemetry --- source/plugins/ruby/in_kube_nodes.rb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 2d3417622..997167780 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -153,6 +153,7 @@ def enumerate continuationToken = nil nodeInventory = {} nodeItemsCacheSizeKB = 0 + nodeCount = 0 @nodeCacheMutex.synchronize { nodeInventory["items"] = @nodeItemsCache.values.clone if KubernetesApiClient.isEmitCacheTelemetry() @@ -162,7 +163,8 @@ def 
enumerate nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeCount = nodeInventory["items"].length + $log.info("in_kube_nodes::enumerate : number of node items :#{nodeCount} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" @@ -171,12 +173,13 @@ def enumerate timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) + @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) telemetryProperties = {} if KubernetesApiClient.isEmitCacheTelemetry() telemetryProperties["NODE_ITEMS_CACHE_SIZE_KB"] = nodeItemsCacheSizeKB end - @applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, telemetryProperties) - @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("NodeCount", nodeCount, telemetryProperties) @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end # Setting this to nil so that we dont hold memory until GC kicks in From 3752459a65d2c9392609dbb4ba8a30cdf8f779bb Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 8 Jan 2022 11:26:32 -0800 Subject: [PATCH 09/65] bug fix --- 
source/plugins/ruby/KubernetesApiClient.rb | 27 ++++++++++++--------- source/plugins/ruby/in_kube_podinventory.rb | 16 ++++-------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 003dab9cf..348d2e7ba 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -871,10 +871,10 @@ def watch(resource_name, options = {}) end end - def getOptimizedItem(resource, resourceItem, winNodes = []) + def getOptimizedItem(resource, resourceItem) case resource when "pods" - return getPodOptimizedItem(resourceItem, winNodes) + return getPodOptimizedItem(resourceItem) when "nodes" return getNodeOptimizedItem(resourceItem) when "services" @@ -917,18 +917,23 @@ def getServiceOptimizedItem(resourceItem) return item end - def isWindowsPodItem(podItem, winNodes) + def isWindowsPodItem(podItem) isWindowsPod = false - if !winNodes.nil? && !winNodes.empty? - nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? podItem["spec"]["nodeName"] : "" - if !nodeName.empty? && winNodes.include?(nodeName) - isWindowsPod = true + begin + winNodes = KubernetesApiClient.getWindowsNodesArray() + if !winNodes.nil? && !winNodes.empty? && winNodes.length > 0 + nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? podItem["spec"]["nodeName"] : "" + if !nodeName.empty? 
&& winNodes.include?(nodeName) + isWindowsPod = true + end end + rescue => errorStr + $Log.warn "KubernetesApiClient::::isWindowsPodItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}" end return isWindowsPod end - def getPodOptimizedItem(resourceItem, winNodes) + def getPodOptimizedItem(resourceItem) item = {} begin item["metadata"] = {} @@ -951,7 +956,7 @@ def getPodOptimizedItem(resourceItem, winNodes) item["metadata"]["deletionTimestamp"] = resourceItem["metadata"]["deletionTimestamp"] end end - isWindowsPod = isWindowsPodItem(resourceItem, winNodes) + isWindowsPod = isWindowsPodItem(resourceItem) item["spec"] = {} if !resourceItem["spec"].nil? item["spec"]["containers"] = [] @@ -970,7 +975,7 @@ def getPodOptimizedItem(resourceItem, winNodes) currentContainer["image"] = container["image"] currentContainer["ports"] = container["ports"] currentContainer["command"] = container["command"] - currentContainer["EnvironmentVar"] = "" + currentContainer["env"] = "" if !isDisableClusterCollectEnvVar currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) end @@ -989,7 +994,7 @@ def getPodOptimizedItem(resourceItem, winNodes) currentContainer["image"] = container["image"] currentContainer["ports"] = container["ports"] currentContainer["command"] = container["command"] - currentContainer["EnvironmentVar"] = "" + currentContainer["env"] = "" if !isDisableClusterCollectEnvVar currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) end diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 7ed5e29cf..4103fcd33 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -676,16 +676,10 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) def watch_pods $log.info("in_kube_podinventory::watch_pods:Start @ 
#{Time.now.utc.iso8601}") podsResourceVersion = nil - isCheckedWindowsNodes = false + # invoke getWindowsNodes to get windowsnodearray cache populated + KubernetesApiClient.getWindowsNodes() loop do begin - # check if the cluster has windows nodes since windows container records requires inventory specific fields - winNodes = KubernetesApiClient.getWindowsNodesArray() - if !isCheckedWindowsNodes && winNodes.empty? - winNodes = KubernetesApiClient.getWindowsNodes() - isCheckedWindowsNodes = true - end - $log.info("in_kube_podinventory::watch_pods:number of windows nodes: #{winNodes.length} @ #{Time.now.utc.iso8601}") if podsResourceVersion.nil? # clear cache before filling the cache with list @podCacheMutex.synchronize { @@ -702,7 +696,7 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + podItem = KubernetesApiClient.getOptimizedItem("pods", item) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -727,7 +721,7 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + podItem = KubernetesApiClient.getOptimizedItem("pods", item) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -770,7 +764,7 @@ def watch_pods if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + podItem = KubernetesApiClient.getOptimizedItem("pods", item) if !podItem.nil? && !podItem.empty? 
@podCacheMutex.synchronize { @podItemsCache[key] = podItem From 694bbc0c5f98efd7c02a2bf37b76d84f4be0a2d1 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 8 Jan 2022 21:58:46 -0800 Subject: [PATCH 10/65] further optimize --- source/plugins/ruby/KubernetesApiClient.rb | 55 +++++++++++++++++----- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 348d2e7ba..1bfa780d9 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -939,17 +939,21 @@ def getPodOptimizedItem(resourceItem) item["metadata"] = {} if !resourceItem["metadata"].nil? if !resourceItem["metadata"]["annotations"].nil? - item["metadata"]["annotations"] = resourceItem["metadata"]["annotations"] + item["metadata"]["annotations"] = {} + item["metadata"]["annotations"]["kubernetes.io/config.hash"] = resourceItem["metadata"]["annotations"]["kubernetes.io/config.hash"] end if !resourceItem["metadata"]["labels"].nil? item["metadata"]["labels"] = resourceItem["metadata"]["labels"] end - if !resourceItem["metadata"]["ownerReferences"].nil? - item["metadata"]["ownerReferences"] = resourceItem["metadata"]["ownerReferences"] + if !resourceItem["metadata"]["ownerReferences"].nil? 
&& resourceItem["metadata"]["ownerReferences"].length > 0 + item["metadata"]["ownerReferences"] = [] + ownerReference = {} + ownerReference["name"] = resourceItem["metadata"]["ownerReferences"][0]["name"] + ownerReference["kind"] = resourceItem["metadata"]["ownerReferences"][0]["kind"] + item["metadata"]["ownerReferences"].push(ownerReference) end item["metadata"]["name"] = resourceItem["metadata"]["name"] item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] - item["metadata"]["resourceVersion"] = resourceItem["metadata"]["resourceVersion"] item["metadata"]["uid"] = resourceItem["metadata"]["uid"] item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] if !resourceItem["metadata"]["deletionTimestamp"].nil? @@ -1021,14 +1025,12 @@ def getPodOptimizedItem(resourceItem) if !resourceItem["status"]["phase"].nil? item["status"]["phase"] = resourceItem["status"]["phase"] end - item["status"]["conditions"] = [] if !resourceItem["status"]["conditions"].nil? + item["status"]["conditions"] = [] resourceItem["status"]["conditions"].each do |condition| currentCondition = {} currentCondition["type"] = condition["type"] currentCondition["status"] = condition["status"] - ## TODO - check if we need this - currentCondition["lastTransitionTime"] = condition["lastTransitionTime"] item["status"]["conditions"].push(currentCondition) end end @@ -1083,12 +1085,25 @@ def getNodeOptimizedItem(resourceItem) item["spec"] = {} if !resourceItem["spec"].nil? if !resourceItem["spec"]["providerID"].nil? && !resourceItem["spec"]["providerID"].empty? - item["spec"]["providerID"] = resourceItem["spec"]["providerID"] + provider = resourceItem["spec"]["providerID"].split(":")[0] + if !provider.nil? && !provider.empty? + item["spec"]["providerID"] = provider + end end end item["status"] = {} if !resourceItem["status"].nil? 
- item["status"]["conditions"] = resourceItem["status"]["conditions"] + item["status"]["conditions"] = [] + if !resourceItem["status"]["conditions"].nil? + resourceItem["status"]["conditions"].each do |condition| + currentCondition = {} + currentCondition["type"] = condition["type"] + currentCondition["status"] = condition["status"] + currentCondition["lastTransitionTime"] = condition["lastTransitionTime"] + item["status"]["conditions"].push(currentCondition) + end + end + item["status"]["nodeInfo"] = {} nodeInfo = {} if !resourceItem["status"]["nodeInfo"].nil? && !resourceItem["status"]["nodeInfo"].empty? @@ -1100,8 +1115,26 @@ def getNodeOptimizedItem(resourceItem) nodeInfo["kernelVersion"] = resourceItem["status"]["nodeInfo"]["kernelVersion"] end item["status"]["nodeInfo"] = nodeInfo - item["status"]["allocatable"] = resourceItem["status"]["allocatable"] - item["status"]["capacity"] = resourceItem["status"]["capacity"] + + item["status"]["allocatable"] = {} + nodeAllocatable = {} + if !resourceItem["status"]["allocatable"].nil? && !resourceItem["status"]["allocatable"].empty? + nodeAllocatable["cpu"] = resourceItem["status"]["allocatable"]["cpu"] + nodeAllocatable["memory"] = resourceItem["status"]["allocatable"]["memory"] + nodeAllocatable["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"] + nodeAllocatable["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"] + end + item["status"]["allocatable"] = nodeAllocatable + + item["status"]["capacity"] = {} + nodeCapacity = {} + if !resourceItem["status"]["capacity"].nil? && !resourceItem["status"]["capacity"].empty? 
+ nodeCapacity["cpu"] = resourceItem["status"]["allocatable"]["cpu"] + nodeCapacity["memory"] = resourceItem["status"]["allocatable"]["memory"] + nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"] + nodeCapacity["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"] + end + item["status"]["capacity"] = nodeCapacity end rescue => errorStr @Log.warn "KubernetesApiClient::getNodeOptimizedItem:Failed with an error : #{errorStr}" From ac88379590f7018a48be29f7bafe61838f26c984 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 9 Jan 2022 19:35:15 -0800 Subject: [PATCH 11/65] bugfix related typo --- source/plugins/ruby/KubernetesApiClient.rb | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 1bfa780d9..594735eee 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -1104,7 +1104,6 @@ def getNodeOptimizedItem(resourceItem) end end - item["status"]["nodeInfo"] = {} nodeInfo = {} if !resourceItem["status"]["nodeInfo"].nil? && !resourceItem["status"]["nodeInfo"].empty? nodeInfo["kubeletVersion"] = resourceItem["status"]["nodeInfo"]["kubeletVersion"] @@ -1116,7 +1115,6 @@ def getNodeOptimizedItem(resourceItem) end item["status"]["nodeInfo"] = nodeInfo - item["status"]["allocatable"] = {} nodeAllocatable = {} if !resourceItem["status"]["allocatable"].nil? && !resourceItem["status"]["allocatable"].empty? nodeAllocatable["cpu"] = resourceItem["status"]["allocatable"]["cpu"] @@ -1126,13 +1124,12 @@ def getNodeOptimizedItem(resourceItem) end item["status"]["allocatable"] = nodeAllocatable - item["status"]["capacity"] = {} nodeCapacity = {} if !resourceItem["status"]["capacity"].nil? && !resourceItem["status"]["capacity"].empty? 
- nodeCapacity["cpu"] = resourceItem["status"]["allocatable"]["cpu"] - nodeCapacity["memory"] = resourceItem["status"]["allocatable"]["memory"] - nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"] - nodeCapacity["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"] + nodeCapacity["cpu"] = resourceItem["status"]["capacity"]["cpu"] + nodeCapacity["memory"] = resourceItem["status"]["capacity"]["memory"] + nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["capacity"]["nvidia.com/gpu"] + nodeCapacity["amd.com/gpu"] = resourceItem["status"]["capacity"]["amd.com/gpu"] end item["status"]["capacity"] = nodeCapacity end From 5835f4284f007a21e8202d8d1a793634c016d78f Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 10 Jan 2022 19:14:16 -0800 Subject: [PATCH 12/65] node allocatable cache --- source/plugins/ruby/KubernetesApiClient.rb | 76 ++++--- source/plugins/ruby/in_kube_nodes.rb | 10 +- source/plugins/ruby/in_kube_podinventory.rb | 207 ++++++++++++++++++-- 3 files changed, 245 insertions(+), 48 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 594735eee..194388d9f 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -37,7 +37,6 @@ class KubernetesApiClient @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@TokenStr = nil - @@NodeMetrics = Hash.new @@WinNodeArray = [] @@telemetryTimeTracker = DateTime.now.to_time.to_i @@resourceLimitsTelemetryHash = {} @@ -411,7 +410,7 @@ def getPodUid(podNameSpace, podMetadata) return podUid end - def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, 
metricNametoReturn, nodeAllocatableRecord, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId @@ -466,11 +465,8 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricItems.push(metricProps) #No container level limit for the given metric, so default to node level limit else - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) - metricValue = @@NodeMetrics[nodeMetricsHashKey] - #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") - + if (metricCategory == "limits" && !nodeAllocatableRecord.nil? && !nodeAllocatableRecord.empty? && nodeAllocatableRecord.has_key?(metricNameToCollect)) + metricValue = nodeAllocatableRecord[metricNameToCollect] metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName @@ -498,7 +494,7 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle return metricItems end #getContainerResourceRequestAndLimits - def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, nodeAllocatableRecord, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId @@ -543,8 +539,9 @@ def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, else #No container level limit for the given metric, so default to node level limit for non-gpu metrics if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - 
metricValue = @@NodeMetrics[nodeMetricsHashKey] + if !nodeAllocatableRecord.nil? && !nodeAllocatableRecord.empty? && nodeAllocatableRecord.has_key?(metricNameToCollect) + metricValue = nodeAllocatableRecord[metricNameToCollect] + end end end if (!metricValue.nil?) @@ -621,11 +618,6 @@ def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metri metricItem["json_Collections"] = [] metricItem["json_Collections"] = metricCollections.to_json - - #push node level metrics to a inmem hash so that we can use it looking up at container level. - #Currently if container level cpu & memory limits are not defined we default to node level limits - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") end rescue => error @Log.warn("parseNodeLimitsFromNodeItem failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") @@ -659,13 +651,6 @@ def parseNodeLimitsAsInsightsMetrics(node, metricCategory, metricNameToCollect, metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect metricItem["Tags"] = metricTags - - #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level. 
- #Currently if container level cpu & memory limits are not defined we default to node level limits - if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") - end end rescue => error @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") @@ -917,6 +902,22 @@ def getServiceOptimizedItem(resourceItem) return item end + def isWindowsNodeItem(nodeResourceItem) + isWindowsNodeItem = false + begin + nodeStatus = nodeResourceItem["status"] + if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? + operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + isWindowsNodeItem = true + end + end + rescue => errorStr + $Log.warn "KubernetesApiClient::::isWindowsNodeItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}" + end + return isWindowsNodeItem + end + def isWindowsPodItem(podItem) isWindowsPod = false begin @@ -1071,6 +1072,21 @@ def getPodOptimizedItem(resourceItem) return item end + def getNodeAllocatableValues(nodeResourceItem) + nodeAllocatable = {} + begin + if !nodeResourceItem["status"].nil? && + !nodeResourceItem["status"]["allocatable"].nil? && + !nodeResourceItem["status"]["allocatable"].empty? 
+ nodeAllocatable["cpu"] = nodeResourceItem["status"]["allocatable"]["cpu"] + nodeAllocatable["memory"] = nodeResourceItem["status"]["allocatable"]["memory"] + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getNodeAllocatableValues:Failed with an error : #{errorStr}" + end + return nodeAllocatable + end + def getNodeOptimizedItem(resourceItem) item = {} begin @@ -1119,8 +1135,12 @@ def getNodeOptimizedItem(resourceItem) if !resourceItem["status"]["allocatable"].nil? && !resourceItem["status"]["allocatable"].empty? nodeAllocatable["cpu"] = resourceItem["status"]["allocatable"]["cpu"] nodeAllocatable["memory"] = resourceItem["status"]["allocatable"]["memory"] - nodeAllocatable["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"] - nodeAllocatable["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"] + if !resourceItem["status"]["allocatable"]["nvidia.com/gpu"].nil? + nodeAllocatable["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"] + end + if !resourceItem["status"]["allocatable"]["amd.com/gpu"].nil? + nodeAllocatable["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"] + end end item["status"]["allocatable"] = nodeAllocatable @@ -1128,8 +1148,12 @@ def getNodeOptimizedItem(resourceItem) if !resourceItem["status"]["capacity"].nil? && !resourceItem["status"]["capacity"].empty? nodeCapacity["cpu"] = resourceItem["status"]["capacity"]["cpu"] nodeCapacity["memory"] = resourceItem["status"]["capacity"]["memory"] - nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["capacity"]["nvidia.com/gpu"] - nodeCapacity["amd.com/gpu"] = resourceItem["status"]["capacity"]["amd.com/gpu"] + if !resourceItem["status"]["capacity"]["nvidia.com/gpu"].nil? + nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["capacity"]["nvidia.com/gpu"] + end + if !resourceItem["status"]["capacity"]["amd.com/gpu"].nil? 
+ nodeCapacity["amd.com/gpu"] = resourceItem["status"]["capacity"]["amd.com/gpu"] + end end item["status"]["capacity"] = nodeCapacity end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 997167780..a7d9a8f6d 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -207,9 +207,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream - $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@MDMKubeNodeInventoryTag, eventStream) if eventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -222,7 +222,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryRecord) if containerNodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) @@ -271,7 +271,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) kubePerfEventStream.add(emitTime, metricRecord) if metricRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) @@ -301,7 +301,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 4103fcd33..acff8a591 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -33,6 +33,7 @@ def initialize # this configurable via configmap @PODS_CHUNK_SIZE = 0 @PODS_EMIT_STREAM_BATCH_SIZE = 0 + @NODES_CHUNK_SIZE = 0 @podCount = 0 @serviceCount = 0 @@ -47,6 +48,10 @@ def initialize @watchServicesThread = nil @serviceItemsCache = {} + @watchNodesThread = nil + @nodeAllocatableCache = {} + @windowsNodeCache = {} + @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" @@ -81,14 +86,27 @@ def start @PODS_EMIT_STREAM_BATCH_SIZE = 200 end $log.info("in_kube_podinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") + + if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? 
&& ENV["NODES_CHUNK_SIZE"].to_i > 0 + @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_podinventory::start: setting to default value since got NODES_CHUNK_SIZE nil or empty") + @NODES_CHUNK_SIZE = 250 + end + $log.info("in_kube_podinventory::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}") + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @podCacheMutex = Mutex.new @serviceCacheMutex = Mutex.new + @nodeAllocatableCacheMutex = Mutex.new + # @windowsNodeCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) @watchPodsThread = Thread.new(&method(:watch_pods)) @watchServicesThread = Thread.new(&method(:watch_services)) + @watchNodesThread = Thread.new(&method(:watch_nodes)) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -101,6 +119,8 @@ def shutdown } @thread.join @watchPodsThread.join + @watchServicesThread.join + @watchNodesThread.join super # This super must be at the end of shutdown method end end @@ -156,6 +176,15 @@ def enumerate(podList = nil) @serviceCount = serviceRecords.length $log.info("in_kube_podinventory::enumerate : number of service items :#{@serviceCount} from Kube API @ #{Time.now.utc.iso8601}") + nodeAllocatableRecords = {} + nodeAllocatableCacheSizeKB = 0 + @nodeAllocatableCacheMutex.synchronize { + nodeAllocatableRecords = @nodeAllocatableCache.clone + } + if KubernetesApiClient.isEmitCacheTelemetry() + nodeAllocatableCacheSizeKB = nodeAllocatableRecords.to_s.length / 1024 + end + $log.info("in_kube_podinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") # to track e2e processing latency @podsAPIE2ELatencyMs = 0 podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i @@ -173,7 +202,7 @@ def enumerate(podList = nil) @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? 
&& !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -181,6 +210,7 @@ def enumerate(podList = nil) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil + nodeAllocatableRecords = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -198,6 +228,7 @@ def enumerate(podList = nil) if KubernetesApiClient.isEmitCacheTelemetry() telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB + telemetryProperties["NODE_ALLOCATABLE_ITEMS_CACHE_SIZE_KB"] = nodeAllocatableCacheSizeKB end ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) @@ -219,7 +250,7 @@ def enumerate(podList = nil) end end - def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = Fluent::Engine.now #batchTime = currentTime.utc.iso8601 @@ -243,11 +274,11 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end # Setting this flag to true so that we can send ContainerInventory records for containers # on windows nodes and parse environment variables 
for these containers + nodeName = "" + if !item["spec"]["nodeName"].nil? + nodeName = item["spec"]["nodeName"] + end if winNodes.length > 0 - nodeName = "" - if !item["spec"]["nodeName"].nil? - nodeName = item["spec"]["nodeName"] - end if (!nodeName.empty? && (winNodes.include? nodeName)) clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel @@ -263,7 +294,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -271,19 +302,23 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc eventStream = Fluent::MultiEventStream.new end + nodeAllocatableRecord = {} + if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? 
&& nodeAllocatableRecords.has_key?(nodeName) + nodeAllocatableRecord = nodeAllocatableRecords[nodeName] + end #container perf records containerMetricDataItems = [] - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime)) containerMetricDataItems.each do |record| kubePerfEventStream.add(emitTime, record) if record end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") 
router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -293,16 +328,16 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc # container GPU records containerGPUInsightsMetricsDataItems = [] - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", 
nodeAllocatableRecord, batchTime)) containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -924,5 +959,143 @@ def watch_services end $log.info("in_kube_podinventory::watch_services:End @ #{Time.now.utc.iso8601}") end + + def watch_nodes + $log.info("in_kube_podinventory::watch_nodes:Start @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + loop do + begin + if nodesResourceVersion.nil? + # clear node limits cache before filling the cache with list + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache.clear() + } + # @windowsNodeCacheMutex.synchronize { + # @windowsNodeCache.clear() + # } + continuationToken = nil + $log.info("in_kube_podinventory::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) + $log.info("in_kube_podinventory::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + if (!nodeInventory.nil? && !nodeInventory.empty?) 
+ nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + end + end + end + else + $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory" + end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? 
+ @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + end + end + end + else + $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory" + end + end + end + begin + $log.info("in_kube_podinventory::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_podinventory::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["name"] + if !key.nil? && !key.empty? 
+ nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache.delete(key) + } + end + end + when "ERROR" + nodesResourceVersion = nil + $log.warn("in_kube_podinventory::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_podinventory::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + end + end + end + rescue Net::ReadTimeout => errorStr + $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher + end + rescue => errorStr + $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + end + end + $log.info("in_kube_podinventory::watch_nodes:End @ #{Time.now.utc.iso8601}") + end end # Kube_Pod_Input end # module From 12f9754bc98261eae3b7122032bb68866f09ca64 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 12 Jan 2022 12:22:40 -0800 Subject: [PATCH 13/65] wincontainerinventory in multiproc --- kubernetes/omsagent.yaml | 8 +- source/plugins/ruby/KubernetesApiClient.rb | 87 +++++++++------ source/plugins/ruby/in_kube_podinventory.rb | 116 ++++++++++++++++---- 3 files changed, 
152 insertions(+), 59 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index d5545f041..1a4caf7dd 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cirshs01122022" imagePullPolicy: IfNotPresent resources: limits: @@ -456,7 +456,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode # - name: omsagent-prometheus - # image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + # image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cirshs01122022" # imagePullPolicy: IfNotPresent # resources: # limits: @@ -605,7 +605,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cirshs01122022" imagePullPolicy: IfNotPresent resources: limits: @@ -615,6 +615,8 @@ spec: cpu: 150m memory: 250Mi env: + - name: EMIT_CACHE_TELEMETRY + value: "true" - name: NUM_OF_FLUENTD_WORKERS value: "3" # This value should be same as number of CPU cores specified under limits - name: AKS_RESOURCE_ID diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 194388d9f..1a7444b28 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -37,7 +37,6 @@ class KubernetesApiClient @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@TokenStr = nil - @@WinNodeArray = [] @@telemetryTimeTracker = DateTime.now.to_time.to_i @@resourceLimitsTelemetryHash = {} @@ -293,8 +292,6 @@ def getWindowsNodes resourceUri = 
getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows") nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body) @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" - # Resetting the windows node cache - @@WinNodeArray.clear if (!nodeInventory.empty?) nodeInventory["items"].each do |item| # check for windows operating system in node metadata @@ -304,11 +301,6 @@ def getWindowsNodes if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) - # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data - # to get images and image tags for containers in windows nodes - if !nodeMetadata.nil? && !nodeMetadata["name"].nil? - @@WinNodeArray.push(nodeMetadata["name"]) - end nodeStatusAddresses = nodeStatus["addresses"] if !nodeStatusAddresses.nil? nodeStatusAddresses.each do |address| @@ -328,7 +320,33 @@ def getWindowsNodes end def getWindowsNodesArray - return @@WinNodeArray + winNodeArray = [] + begin + # get only windows nodes + resourceUri = getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows") + nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body) + @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" + if (!nodeInventory.empty?) + nodeInventory["items"].each do |item| + # check for windows operating system in node metadata + nodeStatus = item["status"] + nodeMetadata = item["metadata"] + if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? 
+ operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data + # to get images and image tags for containers in windows nodes + if !nodeMetadata.nil? && !nodeMetadata["name"].nil? + winNodeArray.push(nodeMetadata["name"]) + end + end + end + end + end + rescue => error + @Log.warn("KubernetesApiClient::getWindowsNodesArray:failed with an error: #{error}") + end + return winNodeArray end def getContainerIDs(namespace) @@ -856,10 +874,10 @@ def watch(resource_name, options = {}) end end - def getOptimizedItem(resource, resourceItem) + def getOptimizedItem(resource, resourceItem, isWindowsItem = false) case resource when "pods" - return getPodOptimizedItem(resourceItem) + return getPodOptimizedItem(resourceItem, isWindowsItem) when "nodes" return getNodeOptimizedItem(resourceItem) when "services" @@ -918,23 +936,23 @@ def isWindowsNodeItem(nodeResourceItem) return isWindowsNodeItem end - def isWindowsPodItem(podItem) - isWindowsPod = false - begin - winNodes = KubernetesApiClient.getWindowsNodesArray() - if !winNodes.nil? && !winNodes.empty? && winNodes.length > 0 - nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? podItem["spec"]["nodeName"] : "" - if !nodeName.empty? && winNodes.include?(nodeName) - isWindowsPod = true - end - end - rescue => errorStr - $Log.warn "KubernetesApiClient::::isWindowsPodItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}" - end - return isWindowsPod - end - - def getPodOptimizedItem(resourceItem) + # def isWindowsPodItem(podItem) + # isWindowsPod = false + # begin + # winNodes = KubernetesApiClient.getWindowsNodesArray() + # if !winNodes.nil? && !winNodes.empty? && winNodes.length > 0 + # nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? 
podItem["spec"]["nodeName"] : "" + # if !nodeName.empty? && winNodes.include?(nodeName) + # isWindowsPod = true + # end + # end + # rescue => errorStr + # $Log.warn "KubernetesApiClient::::isWindowsPodItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}" + # end + # return isWindowsPod + # end + + def getPodOptimizedItem(resourceItem, isWindowsPodItem) item = {} begin item["metadata"] = {} @@ -961,7 +979,7 @@ def getPodOptimizedItem(resourceItem) item["metadata"]["deletionTimestamp"] = resourceItem["metadata"]["deletionTimestamp"] end end - isWindowsPod = isWindowsPodItem(resourceItem) + item["spec"] = {} if !resourceItem["spec"].nil? item["spec"]["containers"] = [] @@ -976,7 +994,7 @@ def getPodOptimizedItem(resourceItem) currentContainer["name"] = container["name"] currentContainer["resources"] = container["resources"] # fields required for windows containers records - if isWindowsPod + if isWindowsPodItem currentContainer["image"] = container["image"] currentContainer["ports"] = container["ports"] currentContainer["command"] = container["command"] @@ -995,7 +1013,7 @@ def getPodOptimizedItem(resourceItem) currentContainer["name"] = container["name"] currentContainer["resources"] = container["resources"] # fields required for windows containers records - if isWindowsPod + if isWindowsPodItem currentContainer["image"] = container["image"] currentContainer["ports"] = container["ports"] currentContainer["command"] = container["command"] @@ -1059,12 +1077,17 @@ def getPodOptimizedItem(resourceItem) currentContainerStatus["restartCount"] = containerStatus["restartCount"] currentContainerStatus["state"] = containerStatus["state"] currentContainerStatus["lastState"] = containerStatus["lastState"] - if isWindowsPod + if isWindowsPodItem currentContainerStatus["imageID"] = containerStatus["imageID"] end item["status"]["containerStatuses"].push(currentContainerStatus) end end + # this metadata used to identify the pod scheduled onto windows node + # so that 
pod inventory can make decision to extract containerinventory records or not + if isWindowsPodItem + item["isWindows"] = "true" + end end rescue => errorStr @Log.warn "KubernetesApiClient::getPodOptimizedItem:Failed with an error : #{errorStr}" diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index acff8a591..2bfc98adb 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -50,7 +50,8 @@ def initialize @watchNodesThread = nil @nodeAllocatableCache = {} - @windowsNodeCache = {} + @windowsNodeNameListCache = [] + @windowsContainerRecordsCacheSizeBytes = 0 @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @@ -102,11 +103,11 @@ def start @podCacheMutex = Mutex.new @serviceCacheMutex = Mutex.new @nodeAllocatableCacheMutex = Mutex.new - # @windowsNodeCacheMutex = Mutex.new + @windowsNodeNameCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) + @watchNodesThread = Thread.new(&method(:watch_nodes)) @watchPodsThread = Thread.new(&method(:watch_pods)) @watchServicesThread = Thread.new(&method(:watch_services)) - @watchNodesThread = Thread.new(&method(:watch_nodes)) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -133,6 +134,7 @@ def enumerate(podList = nil) @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 + @windowsContainerRecordsCacheSizeBytes = 0 @controllerData = {} currentTime = Time.now batchTime = currentTime.utc.iso8601 @@ -229,6 +231,7 @@ def enumerate(podList = nil) telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB telemetryProperties["NODE_ALLOCATABLE_ITEMS_CACHE_SIZE_KB"] = nodeAllocatableCacheSizeKB + telemetryProperties["WINDOWS_CONTAINER_RECORDS_CACHE_SIZE_KB"] = @windowsContainerRecordsCacheSizeBytes / 1024 end 
ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) @@ -261,8 +264,8 @@ def parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, @@istestvar = ENV["ISTEST"] begin #begin block start - # Getting windows nodes from kubeapi - winNodes = KubernetesApiClient.getWindowsNodesArray + # # Getting windows nodes from kubeapi + # winNodes = KubernetesApiClient.getWindowsNodesArray podInventory["items"].each do |item| #podInventory block start # pod inventory records podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) @@ -278,17 +281,18 @@ def parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, if !item["spec"]["nodeName"].nil? nodeName = item["spec"]["nodeName"] end - if winNodes.length > 0 - if (!nodeName.empty? && (winNodes.include? nodeName)) - clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] - #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel - containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true) - # Send container inventory records for containers on windows nodes - @winContainerCount += containerInventoryRecords.length - containerInventoryRecords.each do |cirecord| - if !cirecord.nil? - containerInventoryStream.add(emitTime, cirecord) if cirecord - end + if (!item["isWindows"].nil? && !item["isWindows"].empty? 
&& item["isWindows"].downcase == "true") + clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] + #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel + containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true) + if KubernetesApiClient.isEmitCacheTelemetry() + @windowsContainerRecordsCacheSizeBytes += containerInventoryRecords.to_s.length + end + # Send container inventory records for containers on windows nodes + @winContainerCount += containerInventoryRecords.length + containerInventoryRecords.each do |cirecord| + if !cirecord.nil? + containerInventoryStream.add(emitTime, cirecord) if cirecord end end end @@ -711,8 +715,13 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) def watch_pods $log.info("in_kube_podinventory::watch_pods:Start @ #{Time.now.utc.iso8601}") podsResourceVersion = nil - # invoke getWindowsNodes to get windowsnodearray cache populated - KubernetesApiClient.getWindowsNodes() + # invoke getWindowsNodes to handle scenario where windowsNodeNameCache not populated yet on containerstart + winNodes = KubernetesApiClient.getWindowsNodesArray() + if winNodes.length > 0 + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache = winNodes.dup + } + end loop do begin if podsResourceVersion.nil? 
@@ -720,6 +729,10 @@ def watch_pods @podCacheMutex.synchronize { @podItemsCache.clear() } + currentWindowsNodeNameList = [] + @windowsNodeNameCacheMutex.synchronize { + currentWindowsNodeNameList = @windowsNodeNameListCache.dup + } continuationToken = nil $log.info("in_kube_podinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") @@ -731,7 +744,12 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + isWindowsPodItem = false + if !nodeName.empty? && !currentWindowsNodeNameList.nil? && !currentWindowsNodeNameList.empty? && currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -756,7 +774,15 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + isWindowsPodItem = false + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) if !podItem.nil? && !podItem.empty? 
@podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -799,7 +825,19 @@ def watch_pods if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + currentWindowsNodeNameList = [] + @windowsNodeNameCacheMutex.synchronize { + currentWindowsNodeNameList = @windowsNodeNameListCache.dup + } + isWindowsPodItem = false + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -970,9 +1008,9 @@ def watch_nodes @nodeAllocatableCacheMutex.synchronize { @nodeAllocatableCache.clear() } - # @windowsNodeCacheMutex.synchronize { - # @windowsNodeCache.clear() - # } + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache.clear() + } continuationToken = nil $log.info("in_kube_podinventory::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") @@ -986,6 +1024,14 @@ def watch_nodes key = item["metadata"]["name"] if !key.nil? && !key.empty? nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) + if isWindowsNodeItem + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + end if !nodeAllocatable.nil? && !nodeAllocatable.empty? 
@nodeAllocatableCacheMutex.synchronize { @nodeAllocatableCache[key] = nodeAllocatable @@ -1011,6 +1057,14 @@ def watch_nodes key = item["metadata"]["name"] if !key.nil? && !key.empty? nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) + if isWindowsNodeItem + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + end if !nodeAllocatable.nil? && !nodeAllocatable.empty? @nodeAllocatableCacheMutex.synchronize { @nodeAllocatableCache[key] = nodeAllocatable @@ -1052,6 +1106,14 @@ def watch_nodes end if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) key = item["metadata"]["name"] + isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) + if isWindowsNodeItem + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + end if !key.nil? && !key.empty? nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) if !nodeAllocatable.nil? && !nodeAllocatable.empty? @@ -1066,6 +1128,12 @@ def watch_nodes end elsif notice["type"] == "DELETED" key = item["metadata"]["name"] + isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) + if isWindowsNodeItem + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache.delete(key) + } + end if !key.nil? && !key.empty? 
@nodeAllocatableCacheMutex.synchronize { @nodeAllocatableCache.delete(key) From 5da266fdbd1cff63588081ad068c4e3ee4ac7985 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 14 Jan 2022 18:42:22 -0800 Subject: [PATCH 14/65] disable health --- build/linux/installer/conf/kube.conf | 80 ++++++++++++++-------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 10a271d99..0a01d63f2 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -231,12 +231,12 @@ #fluent forward plugin - - @type forward - port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" - bind 0.0.0.0 - chunk_size_limit 4m - + # + # @type forward + # port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" + # bind 0.0.0.0 + # chunk_size_limit 4m + # #Kubernetes Persistent Volume inventory @@ -254,13 +254,13 @@ @log_level debug - #Kubernetes health - - @type kube_health - tag kubehealth.ReplicaSet - run_interval 60 - @log_level debug - + # #Kubernetes health + # + # @type kube_health + # tag kubehealth.ReplicaSet + # run_interval 60 + # @log_level debug + # #cadvisor perf- Windows nodes @@ -287,9 +287,9 @@ #health model aggregation filter - - @type health_model_builder - + # + # @type health_model_builder + # #kubepvinventory @@ -371,30 +371,30 @@ #kubehealth - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - - - @type file - path /var/opt/microsoft/docker-cimprov/state/kubehealth*.buffer - overflow_action drop_oldest_chunk - chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s - retry_max_times 10 - retry_wait 5s - retry_max_interval 5m - flush_thread_count 5 - - keepalive true - + # + # @type forward + # @log_level debug + # send_timeout 30 + # connect_timeout 30 + # heartbeat_type none + # + # host 0.0.0.0 + # port 
"#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + # + # + # @type file + # path /var/opt/microsoft/docker-cimprov/state/kubehealth*.buffer + # overflow_action drop_oldest_chunk + # chunk_limit_size 4m + # queue_limit_length 20 + # flush_interval 20s + # retry_max_times 10 + # retry_wait 5s + # retry_max_interval 5m + # flush_thread_count 5 + # + # keepalive true + # @type mdm From 658a4403bde92f854a01dba81a3059dcddeaa2df Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 16 Jan 2022 20:44:08 -0800 Subject: [PATCH 15/65] config events on different core --- build/linux/installer/conf/kube.conf | 69 ++++++++++++++-------------- kubernetes/linux/main.sh | 12 +++++ kubernetes/omsagent.yaml | 6 +-- 3 files changed, 50 insertions(+), 37 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 0a01d63f2..28a07c223 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -229,6 +229,41 @@ retry_mdm_post_wait_minutes 30 + + #Kubernetes events + + @type kube_events + tag oneagent.containerInsights.KUBE_EVENTS_BLOB + run_interval 60 + @log_level debug + + + #kubeevents + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + #fluent forward plugin # @@ -246,14 +281,6 @@ @log_level debug - #Kubernetes events - - @type kube_events - tag oneagent.containerInsights.KUBE_EVENTS_BLOB - run_interval 60 - @log_level debug - - # #Kubernetes health # # @type kube_health @@ -344,32 +371,6 @@ keepalive true - #kubeevents - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type 
none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - - - @type file - path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer - overflow_action drop_oldest_chunk - chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s - retry_max_times 10 - retry_wait 5s - retry_max_interval 5m - flush_thread_count 5 - - keepalive true - - #kubehealth # # @type forward diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 023cc11e4..00301a969 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -588,16 +588,25 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else case $NUM_OF_FLUENTD_WORKERS in + 4) + export NUM_OF_FLUENTD_WORKERS=4 + export FLUENTD_POD_INVENTORY_WORKER_ID=3 + export FLUENTD_NODE_INVENTORY_WORKER_ID=2 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; 3) export NUM_OF_FLUENTD_WORKERS=3 export FLUENTD_POD_INVENTORY_WORKER_ID=2 export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; 2) export NUM_OF_FLUENTD_WORKERS=2 export FLUENTD_POD_INVENTORY_WORKER_ID=1 export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; @@ -605,12 +614,14 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then export NUM_OF_FLUENTD_WORKERS=1 export FLUENTD_POD_INVENTORY_WORKER_ID=0 export FLUENTD_NODE_INVENTORY_WORKER_ID=0 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; esac echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc + echo 
"export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc source ~/.bashrc @@ -618,6 +629,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "num of workers:${NUM_OF_FLUENTD_WORKERS}" echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}" + echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "*** starting fluentd v1 in replicaset" diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 1a4caf7dd..8a4532035 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -609,8 +609,8 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 3 - memory: 1.5Gi + cpu: 4 + memory: 2Gi requests: cpu: 150m memory: 250Mi @@ -618,7 +618,7 @@ spec: - name: EMIT_CACHE_TELEMETRY value: "true" - name: NUM_OF_FLUENTD_WORKERS - value: "3" # This value should be same as number of CPU cores specified under limits + value: "4" # This value should be same as number of CPU cores specified under limits - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION From b8b8d181530add37e908d45ce5906b1b1e856c24 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 17 Jan 2022 10:44:43 -0800 Subject: [PATCH 16/65] add ts to logs --- source/plugins/ruby/in_kube_nodes.rb | 16 ++++----- source/plugins/ruby/in_kube_podinventory.rb | 40 ++++++++++----------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index a7d9a8f6d..3746dc224 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -599,15 +599,15 @@ def watch_nodes @nodeItemsCache[key] = nodeItem } else - $log.warn 
"in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory" + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") @@ -624,15 +624,15 @@ def watch_nodes @nodeItemsCache[key] = nodeItem } else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory" + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end end end @@ -667,10 +667,10 @@ def watch_nodes @nodeItemsCache[key] = nodeItem } else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" end elsif notice["type"] == "DELETED" key = item["metadata"]["uid"] diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 
2bfc98adb..36d390cb4 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -755,10 +755,10 @@ def watch_pods @podItemsCache[key] = podItem } else - $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil" + $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" end end end @@ -788,15 +788,15 @@ def watch_pods @podItemsCache[key] = podItem } else - $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil" + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory" + $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" end end end @@ -843,10 +843,10 @@ def watch_pods @podItemsCache[key] = podItem } else - $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil" + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" end elsif notice["type"] == "DELETED" key = item["metadata"]["uid"] @@ -915,15 +915,15 @@ def watch_services @serviceItemsCache[key] = serviceItem } else - $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty" + $log.warn 
"in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty" + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory" + $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory @ #{Time.now.utc.iso8601}" end serviceInventory = nil end @@ -959,10 +959,10 @@ def watch_services @serviceItemsCache[key] = serviceItem } else - $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty" + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty" + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" end elsif notice["type"] == "DELETED" key = item["metadata"]["uid"] @@ -1037,15 +1037,15 @@ def watch_nodes @nodeAllocatableCache[key] = nodeAllocatable } else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory" + $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end while (!continuationToken.nil? && !continuationToken.empty?) 
continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") @@ -1070,15 +1070,15 @@ def watch_nodes @nodeAllocatableCache[key] = nodeAllocatable } else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory" + $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end end end @@ -1121,10 +1121,10 @@ def watch_nodes @nodeAllocatableCache[key] = nodeAllocatable } else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end elsif notice["type"] == "DELETED" key = item["metadata"]["name"] From 6cf9c1137b5f32207d253ba50ed79de5383f6678 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 17 Jan 2022 21:54:09 -0800 Subject: [PATCH 17/65] move kube perf records to separate plugin --- build/linux/installer/conf/kube.conf | 8 + .../installer/datafiles/base_container.data | 1 + kubernetes/linux/main.sh | 19 +- source/plugins/ruby/in_kube_perfinventory.rb | 538 ++++++++++++++++++ source/plugins/ruby/in_kube_podinventory.rb | 232 ++++---- 5 files changed, 681 insertions(+), 117 deletions(-) create mode 100644 source/plugins/ruby/in_kube_perfinventory.rb diff --git 
a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 28a07c223..6f4d91fe6 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -273,6 +273,14 @@ # chunk_size_limit 4m # + #Kubernetes perf inventory + + @type kube_perfinventory + tag oneagent.containerInsights.LINUX_PERF_BLOB + run_interval 60 + @log_level debug + + #Kubernetes Persistent Volume inventory @type kube_pvinventory diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index a405e760f..0268499dc 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -162,6 +162,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/fluent/plugin/in_containerinventory.rb; source/plugins/ruby/in_containerinventory.rb; 644; root; root /etc/fluent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root /etc/fluent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root +/etc/fluent/plugin/in_kube_perfinventory.rb; source/plugins/ruby/in_kube_perfinventory.rb; 644; root; root /etc/fluent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root /etc/fluent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root /etc/fluent/plugin/in_kube_health.rb; source/plugins/ruby/in_kube_health.rb; 644; root; root diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 00301a969..80da23d23 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -570,9 +570,26 @@ if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & else - echo "starting mdsd mode in main container..." 
+ echo "starting mdsd in main container..." # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>>/dev/null & + + ## TODO- evaluate again multiplace instances of mdsd + # echo "starting mdsd tenant instance 2 in main container..." + # echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in main container..." + # #use tenant name to avoid unix socket conflict and different ports for port conflict + # #roleprefix to use container specific mdsd socket + # MDSD_INSTANCE_ID="tenant2" + # export TENANT_NAME="${MDSD_INSTANCE_ID}" + # echo "export TENANT_NAME=$TENANT_NAME" >>~/.bashrc + # export MDSD_ROLE_PREFIX=/var/run/mdsd-${TENANT_NAME}/default + # echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >>~/.bashrc + # export MDSD_FLUENT_SOCKET_PORT_TENANT2="26230" + # echo "export MDSD_FLUENT_SOCKET_PORT_TENANT2=$MDSD_FLUENT_SOCKET_PORT_TENANT2" >>~/.bashrc + # source ~/.bashrc + # mkdir /var/run/mdsd-${MDSD_INSTANCE_ID} + # # add -T 0xFFFF for full traces + # mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd2.err -w ${MDSD_LOG}/mdsd2.warn -o ${MDSD_LOG}/mdsd2.info -q ${MDSD_LOG}/mdsd2.qos & fi # Set up a cron job for logrotation diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb new file mode 100644 index 000000000..888f0db76 --- /dev/null +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -0,0 +1,538 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require "fluent/plugin/input" + +module Fluent::Plugin + class Kube_PerfInventory_Input < Input + Fluent::Plugin.register_input("kube_perfinventory", self) + + def initialize + super + require "yaml" + require "yajl/json_gem" + require "yajl" + require "set" + require "time" + require "net/http" + + require_relative "KubernetesApiClient" + require_relative 
"ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + require_relative "constants" + require_relative "extension_utils" + + # refer tomlparser-agent-config for updating defaults + # this configurable via configmap + @PODS_CHUNK_SIZE = 0 + @PODS_EMIT_STREAM_BATCH_SIZE = 0 + @NODES_CHUNK_SIZE = 0 + + @watchPodsThread = nil + @podItemsCache = {} + + @watchNodesThread = nil + @nodeAllocatableCache = {} + + @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => "oneagent.containerInsights.LINUX_PERF_BLOB" + + def configure(conf) + super + end + + def start + if @run_interval + super + if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? && ENV["PODS_CHUNK_SIZE"].to_i > 0 + @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_perfinventory::start: setting to default value since got PODS_CHUNK_SIZE nil or empty") + @PODS_CHUNK_SIZE = 1000 + end + $log.info("in_kube_perfinventory::start: PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}") + + if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? && ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i > 0 + @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_perfinventory::start: setting to default value since got PODS_EMIT_STREAM_BATCH_SIZE nil or empty") + @PODS_EMIT_STREAM_BATCH_SIZE = 200 + end + $log.info("in_kube_perfinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") + + if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? 
&& ENV["NODES_CHUNK_SIZE"].to_i > 0 + @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_perfinventory::start: setting to default value since got NODES_CHUNK_SIZE nil or empty") + @NODES_CHUNK_SIZE = 250 + end + $log.info("in_kube_perfinventory::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}") + + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @podCacheMutex = Mutex.new + @nodeAllocatableCacheMutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @watchNodesThread = Thread.new(&method(:watch_nodes)) + @watchPodsThread = Thread.new(&method(:watch_pods)) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + @watchPodsThread.join + @watchNodesThread.join + super # This super must be at the end of shutdown method + end + end + + def enumerate(podList = nil) + begin + podInventory = podList + @podCount = 0 + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kube_perfinventory::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @insightsMetricsTag.nil? 
|| !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + $log.info("in_kube_perfinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_perfinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + end + + nodeAllocatableRecords = {} + nodeAllocatableCacheSizeKB = 0 + @nodeAllocatableCacheMutex.synchronize { + nodeAllocatableRecords = @nodeAllocatableCache.clone + } + $log.info("in_kube_perfinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") + # Initializing continuation token to nil + continuationToken = nil + podItemsCacheSizeKB = 0 + podInventory = {} + @podCacheMutex.synchronize { + podInventory["items"] = @podItemsCache.values.clone + } + if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
+ $log.info("in_kube_perfinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationToken, batchTime) + else + $log.warn "in_kube_perfinventory::enumerate:Received empty podInventory" + end + # Setting these to nil so that we dont hold memory until GC kicks in + podInventory = nil + nodeAllocatableRecords = nil + rescue => errorStr + $log.warn "in_kube_perfinventory::enumerate:Failed in enumerate: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationToken, batchTime = Time.utc.iso8601) + currentTime = Time.now + emitTime = Fluent::Engine.now + kubePerfEventStream = Fluent::MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new + @@istestvar = ENV["ISTEST"] + + begin #begin block start + # # Getting windows nodes from kubeapi + # winNodes = KubernetesApiClient.getWindowsNodesArray + podInventory["items"].each do |item| #podInventory block start + nodeName = "" + if !item["spec"]["nodeName"].nil? + nodeName = item["spec"]["nodeName"] + end + + nodeAllocatableRecord = {} + if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? 
&& nodeAllocatableRecords.has_key?(nodeName) + nodeAllocatableRecord = nodeAllocatableRecords[nodeName] + end + #container perf records + containerMetricDataItems = [] + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime)) + + containerMetricDataItems.each do |record| + kubePerfEventStream.add(emitTime, record) if record + end + + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_perfinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + kubePerfEventStream = Fluent::MultiEventStream.new + end + + # container GPU records + containerGPUInsightsMetricsDataItems = [] + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord + end + + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_perfinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = Fluent::MultiEventStream.new + end + end #podInventory block end + + if kubePerfEventStream.count > 0 + $log.info("in_kube_perfinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end + + if insightsMetricsEventStream.count > 0 + $log.info("in_kube_perfinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + insightsMetricsEventStream = nil + end + rescue => errorStr + $log.warn "Failed in parse_and_emit_record kube perf inventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end #begin block end + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_kube_perfinventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kube_perfinventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kube_perfinventory::run_periodic: enumerate Failed to retrieve perf inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock + end + @mutex.unlock + end + + def watch_pods + $log.info("in_kube_perfinventory::watch_pods:Start @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + loop do + begin + if podsResourceVersion.nil? 
+ # clear cache before filling the cache with list + @podCacheMutex.synchronize { + @podItemsCache.clear() + } + currentWindowsNodeNameList = [] + continuationToken = nil + $log.info("in_kube_perfinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") + $log.info("in_kube_perfinventory::watch_pods:Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + if (!podInventory.nil? && !podInventory.empty?) + podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) + $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods", item) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory" + end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") + if (!podInventory.nil? && !podInventory.empty?) + podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
+ $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods", item) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + end + end + end + begin + $log.info("in_kube_perfinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_perfinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
+ podsResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods", item) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? 
+ @podCacheMutex.synchronize { + @podItemsCache.delete(key) + } + end + end + when "ERROR" + podsResourceVersion = nil + $log.warn("in_kube_perfinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + end + end + $log.info("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity more than readtimeout value used in the connection + $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher + end + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + end + end + $log.info("in_kube_perfinventory::watch_pods:End @ #{Time.now.utc.iso8601}") + end + + def watch_nodes + $log.info("in_kube_perfinventory::watch_nodes:Start @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + loop do + begin + if nodesResourceVersion.nil? 
+ # clear node limits cache before filling the cache with list + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache.clear() + } + continuationToken = nil + $log.info("in_kube_perfinventory::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) + $log.info("in_kube_perfinventory::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_perfinventory::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + if (!nodeInventory.nil? && !nodeInventory.empty?) 
+ nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_perfinventory::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + end + end + begin + $log.info("in_kube_perfinventory::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_perfinventory::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
+ nodesResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_perfinventory::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_perfinventory::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["name"] + if !key.nil? && !key.empty? 
+ @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache.delete(key) + } + end + end + when "ERROR" + nodesResourceVersion = nil + $log.warn("in_kube_perfinventory::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_perfinventory::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + end + end + end + rescue Net::ReadTimeout => errorStr + $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher + end + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + end + end + $log.info("in_kube_perfinventory::watch_nodes:End @ #{Time.now.utc.iso8601}") + end + end # Kube_Pod_Input +end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 36d390cb4..bdf0a7f35 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -49,14 +49,14 @@ def initialize @serviceItemsCache = {} @watchNodesThread = nil - @nodeAllocatableCache = {} + # @nodeAllocatableCache = {} @windowsNodeNameListCache = [] @windowsContainerRecordsCacheSizeBytes = 0 - @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" + # @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" - @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + # @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end 
config_param :run_interval, :time, :default => 60 @@ -102,7 +102,7 @@ def start @mutex = Mutex.new @podCacheMutex = Mutex.new @serviceCacheMutex = Mutex.new - @nodeAllocatableCacheMutex = Mutex.new + # @nodeAllocatableCacheMutex = Mutex.new @windowsNodeNameCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) @watchNodesThread = Thread.new(&method(:watch_nodes)) @@ -143,25 +143,25 @@ def enumerate(podList = nil) podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") - if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - end + # if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + # @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + # end if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) end if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) end - if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - end + # if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + # @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + # end if @tag.nil? 
|| !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) end - $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") end @@ -178,15 +178,15 @@ def enumerate(podList = nil) @serviceCount = serviceRecords.length $log.info("in_kube_podinventory::enumerate : number of service items :#{@serviceCount} from Kube API @ #{Time.now.utc.iso8601}") - nodeAllocatableRecords = {} - nodeAllocatableCacheSizeKB = 0 - @nodeAllocatableCacheMutex.synchronize { - nodeAllocatableRecords = @nodeAllocatableCache.clone - } - if KubernetesApiClient.isEmitCacheTelemetry() - nodeAllocatableCacheSizeKB = nodeAllocatableRecords.to_s.length / 1024 - end - $log.info("in_kube_podinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") + # nodeAllocatableRecords = {} + # nodeAllocatableCacheSizeKB = 0 + # @nodeAllocatableCacheMutex.synchronize { + # nodeAllocatableRecords = @nodeAllocatableCache.clone + # } + # if KubernetesApiClient.isEmitCacheTelemetry() + # nodeAllocatableCacheSizeKB = nodeAllocatableRecords.to_s.length / 1024 + # end + # $log.info("in_kube_podinventory::enumerate : 
number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") # to track e2e processing latency @podsAPIE2ELatencyMs = 0 podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i @@ -204,7 +204,7 @@ def enumerate(podList = nil) @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -212,7 +212,7 @@ def enumerate(podList = nil) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil - nodeAllocatableRecords = nil + # nodeAllocatableRecords = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -230,7 +230,7 @@ def enumerate(podList = nil) if KubernetesApiClient.isEmitCacheTelemetry() telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB - telemetryProperties["NODE_ALLOCATABLE_ITEMS_CACHE_SIZE_KB"] = nodeAllocatableCacheSizeKB + # telemetryProperties["NODE_ALLOCATABLE_ITEMS_CACHE_SIZE_KB"] = nodeAllocatableCacheSizeKB telemetryProperties["WINDOWS_CONTAINER_RECORDS_CACHE_SIZE_KB"] = @windowsContainerRecordsCacheSizeBytes / 1024 end ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) @@ -253,7 +253,7 @@ def enumerate(podList = nil) end end - def parse_and_emit_records(podInventory, 
serviceRecords, nodeAllocatableRecords, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = Fluent::Engine.now #batchTime = currentTime.utc.iso8601 @@ -306,48 +306,48 @@ def parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, eventStream = Fluent::MultiEventStream.new end - nodeAllocatableRecord = {} - if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? && nodeAllocatableRecords.has_key?(nodeName) - nodeAllocatableRecord = nodeAllocatableRecords[nodeName] - end - #container perf records - containerMetricDataItems = [] - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime)) + # nodeAllocatableRecord = {} + # if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? 
&& nodeAllocatableRecords.has_key?(nodeName) + # nodeAllocatableRecord = nodeAllocatableRecords[nodeName] + # end + # #container perf records + # containerMetricDataItems = [] + # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime)) + # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime)) + # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime)) + # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime)) - containerMetricDataItems.each do |record| - kubePerfEventStream.add(emitTime, record) if record - end + # containerMetricDataItems.each do |record| + # kubePerfEventStream.add(emitTime, record) if record + # end - if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - kubePerfEventStream = Fluent::MultiEventStream.new - end + # if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + # $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + # router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + # if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + # $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + # end + # kubePerfEventStream = Fluent::MultiEventStream.new + # end - # container GPU records - containerGPUInsightsMetricsDataItems = [] - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) - containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord - end + # # container GPU records + # containerGPUInsightsMetricsDataItems = [] + # 
containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + # containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + # insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord + # end - if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream - insightsMetricsEventStream = Fluent::MultiEventStream.new - end + # if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + # $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + # if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + # $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + # end + # router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + # insightsMetricsEventStream = Fluent::MultiEventStream.new + # end end #podInventory block end if eventStream.count > 0 @@ -368,23 +368,23 @@ def parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, containerInventoryStream = nil end - if kubePerfEventStream.count > 0 - $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - kubePerfEventStream = nil - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - end + # if kubePerfEventStream.count > 0 + # $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + # router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + # kubePerfEventStream = nil + # if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + # $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + # end + # end - if insightsMetricsEventStream.count > 0 - $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - insightsMetricsEventStream = nil - end + # if insightsMetricsEventStream.count > 0 + # $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + # router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + # if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + # $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + # end + # insightsMetricsEventStream = nil + # end if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send @log.info "Sending pod inventory mdm records to out_mdm" @@ -1004,10 +1004,10 @@ def watch_nodes loop do begin if nodesResourceVersion.nil? - # clear node limits cache before filling the cache with list - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache.clear() - } + # # clear node limits cache before filling the cache with list + # @nodeAllocatableCacheMutex.synchronize { + # @nodeAllocatableCache.clear() + # } @windowsNodeNameCacheMutex.synchronize { @windowsNodeNameListCache.clear() } @@ -1023,7 +1023,7 @@ def watch_nodes nodeInventory["items"].each do |item| key = item["metadata"]["name"] if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) if isWindowsNodeItem @windowsNodeNameCacheMutex.synchronize { @@ -1032,13 +1032,13 @@ def watch_nodes end } end - if !nodeAllocatable.nil? && !nodeAllocatable.empty? 
- @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end + # if !nodeAllocatable.nil? && !nodeAllocatable.empty? + # @nodeAllocatableCacheMutex.synchronize { + # @nodeAllocatableCache[key] = nodeAllocatable + # } + # else + # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + # end else $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end @@ -1056,7 +1056,7 @@ def watch_nodes nodeInventory["items"].each do |item| key = item["metadata"]["name"] if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) if isWindowsNodeItem @windowsNodeNameCacheMutex.synchronize { @@ -1065,13 +1065,13 @@ def watch_nodes end } end - if !nodeAllocatable.nil? && !nodeAllocatable.empty? - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end + # if !nodeAllocatable.nil? && !nodeAllocatable.empty? + # @nodeAllocatableCacheMutex.synchronize { + # @nodeAllocatableCache[key] = nodeAllocatable + # } + # else + # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + # end else $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end @@ -1114,18 +1114,18 @@ def watch_nodes end } end - if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - if !nodeAllocatable.nil? && !nodeAllocatable.empty? 
- @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - end + # if !key.nil? && !key.empty? + # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + # if !nodeAllocatable.nil? && !nodeAllocatable.empty? + # @nodeAllocatableCacheMutex.synchronize { + # @nodeAllocatableCache[key] = nodeAllocatable + # } + # else + # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + # end + # else + # $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + # end elsif notice["type"] == "DELETED" key = item["metadata"]["name"] isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) @@ -1134,11 +1134,11 @@ def watch_nodes @windowsNodeNameListCache.delete(key) } end - if !key.nil? && !key.empty? - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache.delete(key) - } - end + # if !key.nil? && !key.empty? 
+ # @nodeAllocatableCacheMutex.synchronize { + # @nodeAllocatableCache.delete(key) + # } + # end end when "ERROR" nodesResourceVersion = nil From 9f08cb0c696ff47d42f8a3668b8806385c6dd851 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 18 Jan 2022 21:46:39 -0800 Subject: [PATCH 18/65] refactor --- source/plugins/ruby/KubernetesApiClient.rb | 135 +++++++---- source/plugins/ruby/in_kube_perfinventory.rb | 7 +- source/plugins/ruby/in_kube_podinventory.rb | 226 ++++--------------- 3 files changed, 140 insertions(+), 228 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 1a7444b28..f1afd4ac6 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -878,6 +878,8 @@ def getOptimizedItem(resource, resourceItem, isWindowsItem = false) case resource when "pods" return getPodOptimizedItem(resourceItem, isWindowsItem) + when "pods-perf" + return getPodPerfOptimizedItem(resourceItem) when "nodes" return getNodeOptimizedItem(resourceItem) when "services" @@ -936,21 +938,58 @@ def isWindowsNodeItem(nodeResourceItem) return isWindowsNodeItem end - # def isWindowsPodItem(podItem) - # isWindowsPod = false - # begin - # winNodes = KubernetesApiClient.getWindowsNodesArray() - # if !winNodes.nil? && !winNodes.empty? && winNodes.length > 0 - # nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? podItem["spec"]["nodeName"] : "" - # if !nodeName.empty? && winNodes.include?(nodeName) - # isWindowsPod = true - # end - # end - # rescue => errorStr - # $Log.warn "KubernetesApiClient::::isWindowsPodItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}" - # end - # return isWindowsPod - # end + def getPodPerfOptimizedItem(resourceItem) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + if !resourceItem["metadata"]["annotations"].nil? 
+ item["metadata"]["annotations"] = {} + item["metadata"]["annotations"]["kubernetes.io/config.hash"] = resourceItem["metadata"]["annotations"]["kubernetes.io/config.hash"] + end + + if !resourceItem["metadata"]["ownerReferences"].nil? && resourceItem["metadata"]["ownerReferences"].length > 0 + item["metadata"]["ownerReferences"] = [] + ownerReference = {} + ownerReference["name"] = resourceItem["metadata"]["ownerReferences"][0]["name"] + ownerReference["kind"] = resourceItem["metadata"]["ownerReferences"][0]["kind"] + item["metadata"]["ownerReferences"].push(ownerReference) + end + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + item["metadata"]["uid"] = resourceItem["metadata"]["uid"] + end + + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["containers"] = [] + if !resourceItem["spec"]["containers"].nil? + resourceItem["spec"]["containers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + item["spec"]["containers"].push(currentContainer) + end + end + item["spec"]["initContainers"] = [] + if !resourceItem["spec"]["initContainers"].nil? + resourceItem["spec"]["initContainers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + item["spec"]["initContainers"].push(currentContainer) + end + end + item["spec"]["nodeName"] = "" + if !resourceItem["spec"]["nodeName"].nil? + item["spec"]["nodeName"] = resourceItem["spec"]["nodeName"] + end + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getPodPerfOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end def getPodOptimizedItem(resourceItem, isWindowsPodItem) item = {} @@ -983,54 +1022,60 @@ def getPodOptimizedItem(resourceItem, isWindowsPodItem) item["spec"] = {} if !resourceItem["spec"].nil? 
item["spec"]["containers"] = [] + item["spec"]["initContainers"] = [] isDisableClusterCollectEnvVar = false clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? && clusterCollectEnvironmentVar.casecmp("false") == 0 isDisableClusterCollectEnvVar = true end - if !resourceItem["spec"]["containers"].nil? - resourceItem["spec"]["containers"].each do |container| - currentContainer = {} - currentContainer["name"] = container["name"] - currentContainer["resources"] = container["resources"] - # fields required for windows containers records - if isWindowsPodItem - currentContainer["image"] = container["image"] - currentContainer["ports"] = container["ports"] - currentContainer["command"] = container["command"] - currentContainer["env"] = "" - if !isDisableClusterCollectEnvVar - currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + + # container spec required only for windows container inventory records + if isWindowsPodItem + if !resourceItem["spec"]["containers"].nil? + resourceItem["spec"]["containers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + # fields required for windows containers records + if isWindowsPodItem + currentContainer["image"] = container["image"] + currentContainer["ports"] = container["ports"] + currentContainer["command"] = container["command"] + currentContainer["env"] = "" + if !isDisableClusterCollectEnvVar + currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + end end + item["spec"]["containers"].push(currentContainer) end - item["spec"]["containers"].push(currentContainer) end - end - item["spec"]["initContainers"] = [] - if !resourceItem["spec"]["initContainers"].nil? 
- resourceItem["spec"]["initContainers"].each do |container| - currentContainer = {} - currentContainer["name"] = container["name"] - currentContainer["resources"] = container["resources"] - # fields required for windows containers records - if isWindowsPodItem - currentContainer["image"] = container["image"] - currentContainer["ports"] = container["ports"] - currentContainer["command"] = container["command"] - currentContainer["env"] = "" - if !isDisableClusterCollectEnvVar - currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + if !resourceItem["spec"]["initContainers"].nil? + resourceItem["spec"]["initContainers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + # fields required for windows containers records + if isWindowsPodItem + currentContainer["image"] = container["image"] + currentContainer["ports"] = container["ports"] + currentContainer["command"] = container["command"] + currentContainer["env"] = "" + if !isDisableClusterCollectEnvVar + currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + end end + item["spec"]["initContainers"].push(currentContainer) end - item["spec"]["initContainers"].push(currentContainer) end end + item["spec"]["nodeName"] = "" if !resourceItem["spec"]["nodeName"].nil? item["spec"]["nodeName"] = resourceItem["spec"]["nodeName"] end end item["status"] = {} + if !resourceItem["status"].nil? if !resourceItem["status"]["startTime"].nil? 
item["status"]["startTime"] = resourceItem["status"]["startTime"] diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 888f0db76..7403b86f3 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -156,7 +156,6 @@ def parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationTok begin #begin block start # # Getting windows nodes from kubeapi - # winNodes = KubernetesApiClient.getWindowsNodesArray podInventory["items"].each do |item| #podInventory block start nodeName = "" if !item["spec"]["nodeName"].nil? @@ -285,7 +284,7 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -310,7 +309,7 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -353,7 +352,7 @@ def watch_pods if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) if !podItem.nil? && !podItem.empty? 
@podCacheMutex.synchronize { @podItemsCache[key] = podItem diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index bdf0a7f35..d466a7637 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -49,14 +49,11 @@ def initialize @serviceItemsCache = {} @watchNodesThread = nil - # @nodeAllocatableCache = {} @windowsNodeNameListCache = [] @windowsContainerRecordsCacheSizeBytes = 0 - # @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" - # @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 @@ -102,10 +99,9 @@ def start @mutex = Mutex.new @podCacheMutex = Mutex.new @serviceCacheMutex = Mutex.new - # @nodeAllocatableCacheMutex = Mutex.new @windowsNodeNameCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) - @watchNodesThread = Thread.new(&method(:watch_nodes)) + @watchNodesThread = Thread.new(&method(:watch_windows_nodes)) @watchPodsThread = Thread.new(&method(:watch_pods)) @watchServicesThread = Thread.new(&method(:watch_services)) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i @@ -143,25 +139,18 @@ def enumerate(podList = nil) podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") - # if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - # @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - # end if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) end if @containerInventoryTag.nil? 
|| !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) end - # if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - # @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - # end if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) end - # $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") - # $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") end @@ -178,16 +167,6 @@ def enumerate(podList = nil) @serviceCount = serviceRecords.length $log.info("in_kube_podinventory::enumerate : number of service items :#{@serviceCount} from Kube API @ #{Time.now.utc.iso8601}") - # nodeAllocatableRecords = {} - # nodeAllocatableCacheSizeKB = 0 - # @nodeAllocatableCacheMutex.synchronize { - # nodeAllocatableRecords = @nodeAllocatableCache.clone - # } - # if KubernetesApiClient.isEmitCacheTelemetry() - # nodeAllocatableCacheSizeKB = nodeAllocatableRecords.to_s.length / 1024 - # end - # $log.info("in_kube_podinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") - # to track e2e processing latency @podsAPIE2ELatencyMs = 0 podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing 
continuation token to nil @@ -212,7 +191,6 @@ def enumerate(podList = nil) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil - # nodeAllocatableRecords = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -230,7 +208,6 @@ def enumerate(podList = nil) if KubernetesApiClient.isEmitCacheTelemetry() telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB - # telemetryProperties["NODE_ALLOCATABLE_ITEMS_CACHE_SIZE_KB"] = nodeAllocatableCacheSizeKB telemetryProperties["WINDOWS_CONTAINER_RECORDS_CACHE_SIZE_KB"] = @windowsContainerRecordsCacheSizeBytes / 1024 end ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) @@ -264,8 +241,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc @@istestvar = ENV["ISTEST"] begin #begin block start - # # Getting windows nodes from kubeapi - # winNodes = KubernetesApiClient.getWindowsNodesArray podInventory["items"].each do |item| #podInventory block start # pod inventory records podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) @@ -305,49 +280,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc router.emit_stream(@tag, eventStream) if eventStream eventStream = Fluent::MultiEventStream.new end - - # nodeAllocatableRecord = {} - # if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? 
&& nodeAllocatableRecords.has_key?(nodeName) - # nodeAllocatableRecord = nodeAllocatableRecords[nodeName] - # end - # #container perf records - # containerMetricDataItems = [] - # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime)) - # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime)) - # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime)) - # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime)) - - # containerMetricDataItems.each do |record| - # kubePerfEventStream.add(emitTime, record) if record - # end - - # if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - # $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - # router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - # if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - # $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") - # end - # kubePerfEventStream = Fluent::MultiEventStream.new - # end - - # # container GPU records - # containerGPUInsightsMetricsDataItems = [] - # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) - # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) - # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) - # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) - # containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - # insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord - # end - - # if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - # $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - # if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - # $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - # end - # router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream - # insightsMetricsEventStream = Fluent::MultiEventStream.new - # end end #podInventory block end if eventStream.count > 0 @@ -368,24 +300,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerInventoryStream = nil end - # if kubePerfEventStream.count > 0 - # $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - # router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - # kubePerfEventStream = nil - # if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - # $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") - # end - # end - - # if insightsMetricsEventStream.count > 0 - # $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - # router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream - # if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - # $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - # end - # insightsMetricsEventStream = nil - # end - if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send @log.info "Sending pod inventory mdm records to out_mdm" pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) @@ -746,7 +660,10 @@ def watch_pods if !key.nil? && !key.empty? nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" isWindowsPodItem = false - if !nodeName.empty? 
&& !currentWindowsNodeNameList.nil? && !currentWindowsNodeNameList.empty? && currentWindowsNodeNameList.include?(nodeName) + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) isWindowsPodItem = true end podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) @@ -998,95 +915,69 @@ def watch_services $log.info("in_kube_podinventory::watch_services:End @ #{Time.now.utc.iso8601}") end - def watch_nodes - $log.info("in_kube_podinventory::watch_nodes:Start @ #{Time.now.utc.iso8601}") + def watch_windows_nodes + $log.info("in_kube_podinventory::watch_windows_nodes:Start @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil loop do begin if nodesResourceVersion.nil? - # # clear node limits cache before filling the cache with list - # @nodeAllocatableCacheMutex.synchronize { - # @nodeAllocatableCache.clear() - # } @windowsNodeNameCacheMutex.synchronize { @windowsNodeNameListCache.clear() } continuationToken = nil - $log.info("in_kube_podinventory::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") + $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows&limit=#{@NODES_CHUNK_SIZE}") continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_podinventory::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes:Done getting windows nodes from Kube API @ #{Time.now.utc.iso8601}") if (!nodeInventory.nil? && !nodeInventory.empty?) 
nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes: number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["name"] if !key.nil? && !key.empty? - # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) - if isWindowsNodeItem - @windowsNodeNameCacheMutex.synchronize { - if !@windowsNodeNameListCache.include?(key) - @windowsNodeNameListCache.push(key) - end - } - end - # if !nodeAllocatable.nil? && !nodeAllocatable.empty? - # @nodeAllocatableCacheMutex.synchronize { - # @nodeAllocatableCache[key] = nodeAllocatable - # } - # else - # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - # end + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") if (!nodeInventory.nil? 
&& !nodeInventory.empty?) nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes : number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["name"] if !key.nil? && !key.empty? - # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) - if isWindowsNodeItem - @windowsNodeNameCacheMutex.synchronize { - if !@windowsNodeNameListCache.include?(key) - @windowsNodeNameListCache.push(key) - end - } - end - # if !nodeAllocatable.nil? && !nodeAllocatable.empty? - # @nodeAllocatableCacheMutex.synchronize { - # @nodeAllocatableCache[key] = nodeAllocatable - # } - # else - # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - # end + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end end end begin - $log.info("in_kube_podinventory::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = 
KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + $log.info("in_kube_podinventory::watch_windows_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", label_selector: "kubernetes.io/os=windows", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) if watcher.nil? - $log.warn("in_kube_podinventory::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") else watcher.each do |notice| case notice["type"] @@ -1097,73 +988,50 @@ def watch_nodes !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? nodesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_podinventory::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
break end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + if notice["type"] == "ADDED" # we dont need to worry about modified event since we only need name key = item["metadata"]["name"] - isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) - if isWindowsNodeItem - @windowsNodeNameCacheMutex.synchronize { - if !@windowsNodeNameListCache.include?(key) - @windowsNodeNameListCache.push(key) - end - } - end - # if !key.nil? && !key.empty? - # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - # if !nodeAllocatable.nil? && !nodeAllocatable.empty? - # @nodeAllocatableCacheMutex.synchronize { - # @nodeAllocatableCache[key] = nodeAllocatable - # } - # else - # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - # end - # else - # $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - # end + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } elsif notice["type"] == "DELETED" key = item["metadata"]["name"] - isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) - if isWindowsNodeItem - @windowsNodeNameCacheMutex.synchronize { - @windowsNodeNameListCache.delete(key) - } - end - # if !key.nil? && !key.empty? 
- # @nodeAllocatableCacheMutex.synchronize { - # @nodeAllocatableCache.delete(key) - # } - # end + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache.delete(key) + } end when "ERROR" nodesResourceVersion = nil - $log.warn("in_kube_podinventory::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") break else - $log.warn("in_kube_podinventory::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end end rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr - $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil sleep(5) # do not overwhelm the api-server if api-server broken ensure watcher.finish if watcher end rescue => errorStr - $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil end end - $log.info("in_kube_podinventory::watch_nodes:End @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes:End @ #{Time.now.utc.iso8601}") end end # Kube_Pod_Input end # module From 056ea8b4b1d1a77742cfe6625ef65f5b13049171 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 19 Jan 2022 11:17:36 -0800 Subject: [PATCH 
19/65] minor update --- kubernetes/omsagent.yaml | 9 ++++++--- source/plugins/ruby/in_kube_podinventory.rb | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 8a4532035..ea282929a 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -615,10 +615,13 @@ spec: cpu: 150m memory: 250Mi env: - - name: EMIT_CACHE_TELEMETRY - value: "true" - name: NUM_OF_FLUENTD_WORKERS - value: "4" # This value should be same as number of CPU cores specified under limits + valueFrom: + resourceFieldRef: + containerName: omsagent + resource: limits.cpu + - name: EMIT_CACHE_TELEMETRY + value: "true" # enable only debug or test purpose and disable for prod - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index d466a7637..f0ddac0b8 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -995,7 +995,7 @@ def watch_windows_nodes # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
break end - if notice["type"] == "ADDED" # we dont need to worry about modified event since we only need name + if notice["type"] == "ADDED" # we dont need to worry about modified event since we only need node name key = item["metadata"]["name"] @windowsNodeNameCacheMutex.synchronize { if !@windowsNodeNameListCache.include?(key) From b940e454573e56cfbff78563c76d44c9725f58ad Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 2 Feb 2022 11:20:55 -0800 Subject: [PATCH 20/65] remove commented code --- kubernetes/linux/main.sh | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index bea27379c..048dfcaa0 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -578,23 +578,6 @@ else echo "starting mdsd in main container..." # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>>/dev/null & - - ## TODO- evaluate again multiplace instances of mdsd - # echo "starting mdsd tenant instance 2 in main container..." - # echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in main container..." 
- # #use tenant name to avoid unix socket conflict and different ports for port conflict - # #roleprefix to use container specific mdsd socket - # MDSD_INSTANCE_ID="tenant2" - # export TENANT_NAME="${MDSD_INSTANCE_ID}" - # echo "export TENANT_NAME=$TENANT_NAME" >>~/.bashrc - # export MDSD_ROLE_PREFIX=/var/run/mdsd-${TENANT_NAME}/default - # echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >>~/.bashrc - # export MDSD_FLUENT_SOCKET_PORT_TENANT2="26230" - # echo "export MDSD_FLUENT_SOCKET_PORT_TENANT2=$MDSD_FLUENT_SOCKET_PORT_TENANT2" >>~/.bashrc - # source ~/.bashrc - # mkdir /var/run/mdsd-${MDSD_INSTANCE_ID} - # # add -T 0xFFFF for full traces - # mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd2.err -w ${MDSD_LOG}/mdsd2.warn -o ${MDSD_LOG}/mdsd2.info -q ${MDSD_LOG}/mdsd2.qos & fi # Set up a cron job for logrotation From 3a0cff2d5f34becee7a07d67b08737db47add8bb Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 5 Feb 2022 10:42:03 -0800 Subject: [PATCH 21/65] mdm state file --- source/plugins/ruby/constants.rb | 4 ++ source/plugins/ruby/in_kube_podinventory.rb | 48 ++++++++++++++++----- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index b9516c2ce..0b16e82f8 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -136,4 +136,8 @@ class Constants #This is for telemetry to track if any of the windows customer has any of the field size >= 64KB #To evaluate switching to Windows AMA 64KB impacts any existing customers MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY = 65536 + + # FileName for MDM POD Inventory records + MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json" + MDM_POD_INVENTORY_STATE_TEMP_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryStateTemp.json" end diff --git a/source/plugins/ruby/in_kube_podinventory.rb 
b/source/plugins/ruby/in_kube_podinventory.rb index d6803fe3d..d7e1063ad 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -20,6 +20,7 @@ def initialize require "set" require "time" require "net/http" + require "fileutils" require_relative "kubernetes_container_inventory" require_relative "KubernetesApiClient" @@ -150,6 +151,7 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] @podInventoryE2EProcessingLatencyMs = 0 + @mdmPodRecords = {} podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") @@ -212,6 +214,7 @@ def enumerate(podList = nil) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil + @mdmPodRecords = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -239,7 +242,7 @@ def enumerate(podList = nil) ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", @controllerSet.length, telemetryProperties) if @winContainerCount > 0 telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount - telemetryProperties["WindowsNodeCount"] = @windowsNodeCount + telemetryProperties["WindowsNodeCount"] = @windowsNodeNameListCache.length telemetryProperties["ClusterWideWindowsContainerInventoryTotalSizeKB"] = @winContainerInventoryTotalSizeBytes / 1024 telemetryProperties["WindowsContainerCountWithInventoryRecordSize64KBorMore"] = @winContainerCountWithInventoryRecordSize64KBOrMore if @winContainerCountWithEnvVarSize64KBOrMore > 0 @@ -298,7 +301,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if KubernetesApiClient.isEmitCacheTelemetry() @windowsContainerRecordsCacheSizeBytes += containerInventoryRecords.to_s.length end - @windowsNodeCount = winNodes.length # Send container 
inventory records for containers on windows nodes @winContainerCount += containerInventoryRecords.length containerInventoryRecords.each do |cirecord| @@ -351,14 +353,11 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send - @log.info "Sending pod inventory mdm records to out_mdm" - pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) - @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" - mdm_pod_inventory_es = Fluent::MultiEventStream.new - pod_inventory_mdm_records.each { |pod_inventory_mdm_record| - mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record - } if pod_inventory_mdm_records - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es + if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 + mdmPodRecordsJson = @mdmPodRecords.to_s + @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" + atomic_file_write(Constants::MDM_POD_INVENTORY_STATE_FILE, Constants::MDM_POD_INVENTORY_STATE_TEMP_FILE, mdmPodRecordsJson) + end end if continuationToken.nil? 
# sending kube services inventory records @@ -437,6 +436,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record = {} begin + mdmPodRecord = {} record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated record["Name"] = item["metadata"]["name"] podNameSpace = item["metadata"]["namespace"] @@ -512,7 +512,13 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record["PodRestartCount"] = 0 #Invoke the helper method to compute ready/not ready mdm metric - @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) + mdmPodRecord["PodUid"] = podUid + mdmPodRecord["ControllerName"] = record["ControllerName"] + mdmPodRecord["Namespace"] = record["Namespace"] + mdmPodRecord["status"] = {} + mdmPodRecord["status"]["conditions"] = item["status"]["conditions"] + mdmPodRecord["containeRecords"] = [] + #@inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) podContainers = [] if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty? 
@@ -549,6 +555,13 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record["ContainerRestartCount"] = containerRestartCount containerStatus = container["state"] + + mdmContainerRecord = {} + mdmContainerRecord["state"] = containerStatus + mdmContainerRecord["restartCount"] = containerRestartCount + mdmContainerRecord["lastState"] = container["lastState"] + mdmPodRecord["containeRecords"].push(mdmContainerRecord.dup) + record["ContainerStatusReason"] = "" # state is of the following form , so just picking up the first key name # "state": { @@ -629,6 +642,8 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) records.push(record) end #container status block end + @mdmPodRecords[podUid] = mdmPodRecord + records.each do |record| if !record.nil? record["PodRestartCount"] = podRestartCount @@ -1083,5 +1098,16 @@ def watch_windows_nodes end $log.info("in_kube_podinventory::watch_windows_nodes:End @ #{Time.now.utc.iso8601}") end + + def atomic_file_write(path, temp_path, content) + begin + File.open(temp_path, "w+") do |f| + f.write(content) + end + FileUtils.mv(temp_path, path) + rescue => err + $log.warn "in_kube_podinventory::atomic_file_write: failed with an error: #{err}" + end + end end # Kube_Pod_Input end # module From a0e4498bdc559e9bffa03485963fba188bb51db4 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 5 Feb 2022 18:47:38 -0800 Subject: [PATCH 22/65] mdm state file --- source/plugins/ruby/in_kube_podinventory.rb | 44 ++++++++++++++++----- source/plugins/ruby/podinventory_to_mdm.rb | 13 +----- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index d7e1063ad..9f8865e95 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -151,7 +151,7 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] 
@podInventoryE2EProcessingLatencyMs = 0 - @mdmPodRecords = {} + @mdmPodRecords = [] podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") @@ -354,7 +354,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 - mdmPodRecordsJson = @mdmPodRecords.to_s + mdmPodRecordsJson = @mdmPodRecords.to_json @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" atomic_file_write(Constants::MDM_POD_INVENTORY_STATE_FILE, Constants::MDM_POD_INVENTORY_STATE_TEMP_FILE, mdmPodRecordsJson) end @@ -513,10 +513,12 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) #Invoke the helper method to compute ready/not ready mdm metric mdmPodRecord["PodUid"] = podUid + mdmPodRecord["Computer"] = nodeName mdmPodRecord["ControllerName"] = record["ControllerName"] mdmPodRecord["Namespace"] = record["Namespace"] - mdmPodRecord["status"] = {} - mdmPodRecord["status"]["conditions"] = item["status"]["conditions"] + mdmPodRecord["PodStatus"] = record["PodStatus"] + mdmPodRecord["PodReadyCondition"] = getPodReadyCondition(item["status"]["conditions"]) + mdmPodRecord["ControllerKind"] = record["ControllerKind"] mdmPodRecord["containeRecords"] = [] #@inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) @@ -557,11 +559,6 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) containerStatus = container["state"] mdmContainerRecord = {} - mdmContainerRecord["state"] = containerStatus - mdmContainerRecord["restartCount"] = containerRestartCount - mdmContainerRecord["lastState"] = container["lastState"] - 
mdmPodRecord["containeRecords"].push(mdmContainerRecord.dup) - record["ContainerStatusReason"] = "" # state is of the following form , so just picking up the first key name # "state": { @@ -586,6 +583,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) end # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm if !record["ControllerKind"].nil? && record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB + mdmContainerRecord["state"] = containerStatus @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus) end end @@ -614,6 +612,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled if lastStateReason.downcase == Constants::REASON_OOM_KILLED + mdmContainerRecord["lastState"] = container["lastState"] @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) end lastStateReason = nil @@ -626,6 +625,8 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) #Populate mdm metric for container restart count if greater than 0 if (!containerRestartCount.nil? && (containerRestartCount.is_a? Integer) && containerRestartCount > 0) + mdmContainerRecord["restartCount"] = containerRestartCount + mdmContainerRecord["lastState"] = container["lastState"] @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) end rescue => errorStr @@ -635,6 +636,10 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record["ContainerLastStatus"] = Hash.new end + if !mdmContainerRecord.empty? 
+ mdmPodRecord["containeRecords"].push(mdmContainerRecord.dup) + end + podRestartCount += containerRestartCount records.push(record.dup) end @@ -642,7 +647,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) records.push(record) end #container status block end - @mdmPodRecords[podUid] = mdmPodRecord + @mdmPodRecords.push(mdmPodRecord.dup) records.each do |record| if !record.nil? @@ -1109,5 +1114,24 @@ def atomic_file_write(path, temp_path, content) $log.warn "in_kube_podinventory::atomic_file_write: failed with an error: #{err}" end end + + def getPodReadyCondition(podStatusConditions) + podReadyCondition = false + begin + if !podStatusConditions.nil? && !podStatusConditions.empty? + podStatusConditions.each do |condition| + if condition["type"] == "Ready" + if condition["status"].downcase == "true" + podReadyCondition = true + end + break #Exit the for loop since we found the ready condition + end + end + end + rescue => err + $log.warn "in_kube_podinventory::getPodReadyCondition failed with an error: #{err}" + end + return podReadyCondition + end end # Kube_Pod_Input end # module diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index c24a91a87..278632cb0 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -218,24 +218,13 @@ def process_record_for_container_restarts_metric(podControllerNameDimValue, podN end end - def process_record_for_pods_ready_metric(podControllerNameDimValue, podNamespaceDimValue, podStatusConditions) + def process_record_for_pods_ready_metric(podControllerNameDimValue, podNamespaceDimValue, podReadyCondition) if @process_incoming_stream begin @log.info "in process_record_for_pods_ready_metric..." if podControllerNameDimValue.nil? || podControllerNameDimValue.empty? podControllerNameDimValue = "No Controller" end - podReadyCondition = false - if !podStatusConditions.nil? && !podStatusConditions.empty? 
- podStatusConditions.each do |condition| - if condition["type"] == "Ready" - if condition["status"].downcase == "true" - podReadyCondition = true - end - break #Exit the for loop since we found the ready condition - end - end - end MdmMetricsGenerator.generatePodReadyMetrics(podControllerNameDimValue, podNamespaceDimValue, podReadyCondition) rescue => errorStr From de4f4b5d1245fec50c13d9ca804167d2ef5b70f1 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 6 Feb 2022 09:18:06 -0800 Subject: [PATCH 23/65] podmdm to separate plugin --- build/linux/installer/conf/kube.conf | 49 ++--- kubernetes/linux/main.sh | 12 ++ source/plugins/ruby/in_kube_podinventory.rb | 9 - .../plugins/ruby/in_kube_podmdminventory.rb | 168 ++++++++++++++++++ 4 files changed, 208 insertions(+), 30 deletions(-) create mode 100644 source/plugins/ruby/in_kube_podmdminventory.rb diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 6f4d91fe6..c3cbe95b1 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -124,25 +124,6 @@ keepalive true - - - @type mdm - @id out_mdm_podinventory - @log_level debug - - @type file - path /var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer - overflow_action drop_oldest_chunk - chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s - retry_max_times 10 - retry_wait 5s - retry_max_interval 5m - flush_thread_count 5 - - retry_mdm_post_wait_minutes 30 - #Kubernetes Nodes @@ -264,6 +245,33 @@ keepalive true + + #Kubernetes podmdm inventory + + @type kube_pdmdminventory + run_interval 60 + @log_level debug + + + + @type mdm + @id out_mdm_podinventory + @log_level debug + + @type file + path /var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + 
retry_mdm_post_wait_minutes 30 + + #fluent forward plugin # @@ -272,8 +280,7 @@ # bind 0.0.0.0 # chunk_size_limit 4m # - - #Kubernetes perf inventory + #Kubernetes perf inventory @type kube_perfinventory tag oneagent.containerInsights.LINUX_PERF_BLOB diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 048dfcaa0..c04fd8eac 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -593,17 +593,27 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else case $NUM_OF_FLUENTD_WORKERS in + 5) + export NUM_OF_FLUENTD_WORKERS=5 + export FLUENTD_POD_INVENTORY_WORKER_ID=4 + export FLUENTD_NODE_INVENTORY_WORKER_ID=3 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; 4) export NUM_OF_FLUENTD_WORKERS=4 export FLUENTD_POD_INVENTORY_WORKER_ID=3 export FLUENTD_NODE_INVENTORY_WORKER_ID=2 export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; 3) export NUM_OF_FLUENTD_WORKERS=3 export FLUENTD_POD_INVENTORY_WORKER_ID=2 export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; @@ -611,6 +621,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then export NUM_OF_FLUENTD_WORKERS=2 export FLUENTD_POD_INVENTORY_WORKER_ID=1 export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; @@ -620,6 +631,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then export FLUENTD_POD_INVENTORY_WORKER_ID=0 export FLUENTD_NODE_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 + export 
FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; esac diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 9f8865e95..6369471f4 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -4,12 +4,9 @@ require "fluent/plugin/input" module Fluent::Plugin - require_relative "podinventory_to_mdm" - class Kube_PodInventory_Input < Input Fluent::Plugin.register_input("kube_podinventory", self) - @@MDMKubePodInventoryTag = "mdm.kubepodinventory" @@hostName = (OMS::Common.get_hostname) def initialize @@ -69,7 +66,6 @@ def initialize def configure(conf) super - @inventoryToMdmConvertor = Inventory2MdmConvertor.new() end def start @@ -285,7 +281,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc podInventoryRecords.each do |record| if !record.nil? eventStream.add(emitTime, record) if record - @inventoryToMdmConvertor.process_pod_inventory_record(record) end end # Setting this flag to true so that we can send ContainerInventory records for containers @@ -520,7 +515,6 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) mdmPodRecord["PodReadyCondition"] = getPodReadyCondition(item["status"]["conditions"]) mdmPodRecord["ControllerKind"] = record["ControllerKind"] mdmPodRecord["containeRecords"] = [] - #@inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) podContainers = [] if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty? @@ -584,7 +578,6 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm if !record["ControllerKind"].nil? 
&& record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB mdmContainerRecord["state"] = containerStatus - @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus) end end @@ -613,7 +606,6 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled if lastStateReason.downcase == Constants::REASON_OOM_KILLED mdmContainerRecord["lastState"] = container["lastState"] - @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) end lastStateReason = nil else @@ -627,7 +619,6 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) if (!containerRestartCount.nil? && (containerRestartCount.is_a? Integer) && containerRestartCount > 0) mdmContainerRecord["restartCount"] = containerRestartCount mdmContainerRecord["lastState"] = container["lastState"] - @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) end rescue => errorStr $log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}" diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb new file mode 100644 index 000000000..9432e4fe0 --- /dev/null +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -0,0 +1,168 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require "fluent/plugin/input" + +module Fluent::Plugin + require_relative "podinventory_to_mdm" + + class Kube_PodMDMInventory_Input < Input + Fluent::Plugin.register_input("kube_podmdminventory", self) + + @@MDMKubePodInventoryTag = "mdm.kubepodinventory" + + def initialize + super + require "yaml" + require "yajl/json_gem" + require "yajl" + require "set" + require "time" + require "net/http" 
+ require "fileutils" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + require_relative "constants" + end + + config_param :run_interval, :time, :default => 60 + + def configure(conf) + super + @inventoryToMdmConvertor = Inventory2MdmConvertor.new() + end + + def start + if @run_interval + super + $log.info("in_kube_podmdminventory::start @ #{Time.now.utc.iso8601}") + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + super # This super must be at the end of shutdown method + end + end + + def enumerate + begin + batchTime = currentTime.utc.iso8601 + parse_and_emit_records(batchTime) + rescue => errorStr + $log.warn "in_kube_podmdminventory::enumerate:Failed in enumerate: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def parse_and_emit_records(batchTime = Time.utc.iso8601) + currentTime = Time.now + begin + if File.exists?(Constants::MDM_POD_INVENTORY_STATE_FILE) + content = File.read(Constants::MDM_POD_INVENTORY_STATE_FILE) + if !content.empty? + mdmPodRecords = Yajl::Parser.parse(StringIO.new(content)) + if !mdmPodRecords.nil? && !mdmPodRecords.empty? + mdmPodRecords.each do |record| + @inventoryToMdmConvertor.process_pod_inventory_record(record) + @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) + containeRecords = record["containeRecords"] + if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 + containeRecords.each do |containerRecord| + if !containerRecord["state"].nil? && !containerRecord["state"].empty? 
+ @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) + end + begin + if !container["lastState"].nil? && container["lastState"].keys.length == 1 + lastStateName = container["lastState"].keys[0] + lastStateObject = container["lastState"][lastStateName] + if !lastStateObject.is_a?(Hash) + raise "expected a hash object. This could signify a bug or a kubernetes API change" + end + if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") + lastStateReason = lastStateObject["reason"] + lastFinishedTime = lastStateObject["finishedAt"] + #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled + if lastStateReason.downcase == Constants::REASON_OOM_KILLED + @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + lastStateReason = nil + end + end + containerRestartCount = containerRecord["restartCount"] + #Populate mdm metric for container restart count if greater than 0 + if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) + @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + rescue => err + $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}" + $log.debug_backtrace(err.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(err) + end + end + end + end + @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" + pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) + @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" + mdm_pod_inventory_es = Fluent::MultiEventStream.new + pod_inventory_mdm_records.each { |pod_inventory_mdm_record| + mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record + } if pod_inventory_mdm_records + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es + end + end + else + $log.warn "in_kube_podmdminventory:parse_and_emit_records:MDM pod inventory state file doesnt exist @ #{Time.now.utc.iso8601}" + end + rescue => errorStr + $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_kube_podmdminventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}") + enumerate + 
$log.info("in_kube_podmdminventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kube_podmdminventory::run_periodic: enumerate Failed to retrieve pod inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock + end + @mutex.unlock + end + end # Kube_Pod_Input +end # module From 4ea1d698ee1cb3a86e85e8b7935ae40ff2c35acc Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 6 Feb 2022 19:03:37 -0800 Subject: [PATCH 24/65] bug fixes --- build/linux/installer/conf/kube.conf | 2 +- build/linux/installer/datafiles/base_container.data | 1 + kubernetes/linux/main.sh | 2 ++ source/plugins/ruby/in_kube_podmdminventory.rb | 10 ++++++---- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index c3cbe95b1..d8bcc53da 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -248,7 +248,7 @@ #Kubernetes podmdm inventory - @type kube_pdmdminventory + @type kube_podmdminventory run_interval 60 @log_level debug diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 650b19243..328a846c7 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -162,6 +162,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/fluent/plugin/in_containerinventory.rb; source/plugins/ruby/in_containerinventory.rb; 644; root; root /etc/fluent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root /etc/fluent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root +/etc/fluent/plugin/in_kube_podmdminventory.rb; source/plugins/ruby/in_kube_podmdminventory.rb; 644; root; root /etc/fluent/plugin/in_kube_perfinventory.rb; source/plugins/ruby/in_kube_perfinventory.rb; 644; root; root 
/etc/fluent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root /etc/fluent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c04fd8eac..c280a31a0 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -640,6 +640,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc source ~/.bashrc echo "*** fluentd worker configuration ***" @@ -647,6 +648,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}" echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}" + echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "*** starting fluentd v1 in replicaset" diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 9432e4fe0..84badf112 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -57,6 +57,7 @@ def shutdown def enumerate begin + currentTime = Time.now batchTime = currentTime.utc.iso8601 parse_and_emit_records(batchTime) rescue => errorStr @@ -67,12 +68,13 @@ def enumerate end def parse_and_emit_records(batchTime = Time.utc.iso8601) - currentTime = Time.now begin if File.exists?(Constants::MDM_POD_INVENTORY_STATE_FILE) content = 
File.read(Constants::MDM_POD_INVENTORY_STATE_FILE) if !content.empty? + $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" mdmPodRecords = Yajl::Parser.parse(StringIO.new(content)) + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" if !mdmPodRecords.nil? && !mdmPodRecords.empty? mdmPodRecords.each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @@ -84,9 +86,9 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) end begin - if !container["lastState"].nil? && container["lastState"].keys.length == 1 - lastStateName = container["lastState"].keys[0] - lastStateObject = container["lastState"][lastStateName] + if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1 + lastStateName = containerRecord["lastState"].keys[0] + lastStateObject = containerRecord["lastState"][lastStateName] if !lastStateObject.is_a?(Hash) raise "expected a hash object. This could signify a bug or a kubernetes API change" end From 45d3e03653b45a62615fa5969e1c2c5554e4d6ea Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 6 Feb 2022 20:41:20 -0800 Subject: [PATCH 25/65] bug fixes --- source/plugins/ruby/in_kube_podinventory.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 6369471f4..7b52b97dd 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -347,11 +347,13 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerInventoryStream = nil end - if continuationToken.nil? 
#no more chunks in this batch to be sent, get all mdm pod inventory records to send + if continuationToken.nil? #no more chunks in this batch to be sent, write all mdm pod inventory records to send if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 mdmPodRecordsJson = @mdmPodRecords.to_json @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" + @log.info "in_kube_podinventory::parse_and_emit_records:Start:atomic_file_write @ #{Time.now.utc.iso8601}" atomic_file_write(Constants::MDM_POD_INVENTORY_STATE_FILE, Constants::MDM_POD_INVENTORY_STATE_TEMP_FILE, mdmPodRecordsJson) + @log.info "in_kube_podinventory::parse_and_emit_records:End:atomic_file_write @ #{Time.now.utc.iso8601}" end end From 5481e4870e4fadb5a315d5426266efbef7322767 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 7 Feb 2022 08:26:36 -0800 Subject: [PATCH 26/65] bug fixes --- .../plugins/ruby/in_kube_podmdminventory.rb | 99 ++++++++++--------- 1 file changed, 52 insertions(+), 47 deletions(-) diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 84badf112..30337c9b7 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -70,60 +70,61 @@ def enumerate def parse_and_emit_records(batchTime = Time.utc.iso8601) begin if File.exists?(Constants::MDM_POD_INVENTORY_STATE_FILE) - content = File.read(Constants::MDM_POD_INVENTORY_STATE_FILE) - if !content.empty? - $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" - mdmPodRecords = Yajl::Parser.parse(StringIO.new(content)) - $log.info "in_kube_podmdminventory:parse_and_emit_records:End:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" - if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
- mdmPodRecords.each do |record| - @inventoryToMdmConvertor.process_pod_inventory_record(record) - @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) - containeRecords = record["containeRecords"] - if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 - containeRecords.each do |containerRecord| - if !containerRecord["state"].nil? && !containerRecord["state"].empty? - @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) - end - begin - if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1 - lastStateName = containerRecord["lastState"].keys[0] - lastStateObject = containerRecord["lastState"][lastStateName] - if !lastStateObject.is_a?(Hash) - raise "expected a hash object. This could signify a bug or a kubernetes API change" - end - if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") - lastStateReason = lastStateObject["reason"] - lastFinishedTime = lastStateObject["finishedAt"] - #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled - if lastStateReason.downcase == Constants::REASON_OOM_KILLED - @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) - end - lastStateReason = nil - end + file = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") + $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" + mdmPodRecords = Yajl::Parser.parse(file) + if !file.nil? + file.close + end + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" + if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
+ mdmPodRecords.each do |record| + @inventoryToMdmConvertor.process_pod_inventory_record(record) + @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) + containeRecords = record["containeRecords"] + if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 + containeRecords.each do |containerRecord| + if !containerRecord["state"].nil? && !containerRecord["state"].empty? + @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) + end + begin + if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1 + lastStateName = containerRecord["lastState"].keys[0] + lastStateObject = containerRecord["lastState"][lastStateName] + if !lastStateObject.is_a?(Hash) + raise "expected a hash object. This could signify a bug or a kubernetes API change" end - containerRestartCount = containerRecord["restartCount"] - #Populate mdm metric for container restart count if greater than 0 - if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) - @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") + lastStateReason = lastStateObject["reason"] + lastFinishedTime = lastStateObject["finishedAt"] + #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled + if lastStateReason.downcase == Constants::REASON_OOM_KILLED + @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + lastStateReason = nil end - rescue => err - $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}" - $log.debug_backtrace(err.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(err) end + containerRestartCount = containerRecord["restartCount"] + #Populate mdm metric for container restart count if greater than 0 + if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) + @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + rescue => err + $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}" + $log.debug_backtrace(err.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(err) end end end - @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" - pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) - @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" - mdm_pod_inventory_es = Fluent::MultiEventStream.new - pod_inventory_mdm_records.each { |pod_inventory_mdm_record| - mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record - } if pod_inventory_mdm_records - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end + @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" + pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) + @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" + mdm_pod_inventory_es = Fluent::MultiEventStream.new + pod_inventory_mdm_records.each { |pod_inventory_mdm_record| + mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record + } if pod_inventory_mdm_records + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end else $log.warn "in_kube_podmdminventory:parse_and_emit_records:MDM pod inventory state file doesnt exist @ #{Time.now.utc.iso8601}" @@ -132,6 +133,10 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) $log.warn 
"in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + ensure + if !file.nil? + file.close + end end end From 1ea93668f803f77661987b7edb867fd106fed663 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 7 Feb 2022 19:46:59 -0800 Subject: [PATCH 27/65] podmdm plugin --- source/plugins/ruby/constants.rb | 1 - source/plugins/ruby/in_kube_perfinventory.rb | 2 - source/plugins/ruby/in_kube_podinventory.rb | 38 ++++-- .../plugins/ruby/in_kube_podmdminventory.rb | 129 +++++++++++------- 4 files changed, 107 insertions(+), 63 deletions(-) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 0b16e82f8..5576d9917 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -139,5 +139,4 @@ class Constants # FileName for MDM POD Inventory records MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json" - MDM_POD_INVENTORY_STATE_TEMP_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryStateTemp.json" end diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 7403b86f3..9733130af 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -155,7 +155,6 @@ def parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationTok @@istestvar = ENV["ISTEST"] begin #begin block start - # # Getting windows nodes from kubeapi podInventory["items"].each do |item| #podInventory block start nodeName = "" if !item["spec"]["nodeName"].nil? 
@@ -272,7 +271,6 @@ def watch_pods @podCacheMutex.synchronize { @podItemsCache.clear() } - currentWindowsNodeNameList = [] continuationToken = nil $log.info("in_kube_perfinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 7b52b97dd..8432965a4 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -351,9 +351,9 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 mdmPodRecordsJson = @mdmPodRecords.to_json @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" - @log.info "in_kube_podinventory::parse_and_emit_records:Start:atomic_file_write @ #{Time.now.utc.iso8601}" - atomic_file_write(Constants::MDM_POD_INVENTORY_STATE_FILE, Constants::MDM_POD_INVENTORY_STATE_TEMP_FILE, mdmPodRecordsJson) - @log.info "in_kube_podinventory::parse_and_emit_records:End:atomic_file_write @ #{Time.now.utc.iso8601}" + @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" + writeMDMRecords(mdmPodRecordsJson) + @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" end end @@ -1097,14 +1097,36 @@ def watch_windows_nodes $log.info("in_kube_podinventory::watch_windows_nodes:End @ #{Time.now.utc.iso8601}") end - def atomic_file_write(path, temp_path, content) + def writeMDMRecords(mdmRecordsJson) + maxRetryCount = 3 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 begin - File.open(temp_path, "w+") do |f| - f.write(content) + f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "w") + 
if !f.nil? + isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + f.truncate(0) + f.write(mdmRecordsJson) + f.flush + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + $log.info "in_kube_podinventory:writeMDMRecords:Successfull and with time taken(ms): #{timetakenMs}" + else + raise "writeMDMRecords:Failed to open file for write" end - FileUtils.mv(temp_path, path) rescue => err - $log.warn "in_kube_podinventory::atomic_file_write: failed with an error: #{err}" + if retryAttemptCount < MaxRetryCount + retryAttemptCount = retryAttemptCount + 1 + sleep (initialRetryDelay * retryAttemptCount) + retry + end + $log.warn "in_kube_podinventory:writeMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ensure + if !f.nil? + f.flock(File::LOCK_UN) + f.close + end end end diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 30337c9b7..2afa9a547 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -69,65 +69,57 @@ def enumerate def parse_and_emit_records(batchTime = Time.utc.iso8601) begin - if File.exists?(Constants::MDM_POD_INVENTORY_STATE_FILE) - file = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") - $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" - mdmPodRecords = Yajl::Parser.parse(file) - if !file.nil? - file.close - end - $log.info "in_kube_podmdminventory:parse_and_emit_records:End:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" - if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
- mdmPodRecords.each do |record| - @inventoryToMdmConvertor.process_pod_inventory_record(record) - @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) - containeRecords = record["containeRecords"] - if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 - containeRecords.each do |containerRecord| - if !containerRecord["state"].nil? && !containerRecord["state"].empty? - @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) - end - begin - if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1 - lastStateName = containerRecord["lastState"].keys[0] - lastStateObject = containerRecord["lastState"][lastStateName] - if !lastStateObject.is_a?(Hash) - raise "expected a hash object. This could signify a bug or a kubernetes API change" - end - if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") - lastStateReason = lastStateObject["reason"] - lastFinishedTime = lastStateObject["finishedAt"] - #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled - if lastStateReason.downcase == Constants::REASON_OOM_KILLED - @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) - end - lastStateReason = nil - end + $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:readMDMRecords @ #{Time.now.utc.iso8601}" + mdmPodRecords = readMDMRecords() + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:readMDMRecords @ #{Time.now.utc.iso8601}" + if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
+ mdmPodRecords.each do |record| + @inventoryToMdmConvertor.process_pod_inventory_record(record) + @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) + containeRecords = record["containeRecords"] + if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 + containeRecords.each do |containerRecord| + if !containerRecord["state"].nil? && !containerRecord["state"].empty? + @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) + end + begin + if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1 + lastStateName = containerRecord["lastState"].keys[0] + lastStateObject = containerRecord["lastState"][lastStateName] + if !lastStateObject.is_a?(Hash) + raise "expected a hash object. This could signify a bug or a kubernetes API change" end - containerRestartCount = containerRecord["restartCount"] - #Populate mdm metric for container restart count if greater than 0 - if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) - @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") + lastStateReason = lastStateObject["reason"] + lastFinishedTime = lastStateObject["finishedAt"] + #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled + if lastStateReason.downcase == Constants::REASON_OOM_KILLED + @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + lastStateReason = nil end - rescue => err - $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}" - $log.debug_backtrace(err.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(err) end + containerRestartCount = containerRecord["restartCount"] + #Populate mdm metric for container restart count if greater than 0 + if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) + @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + rescue => err + $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}" + $log.debug_backtrace(err.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(err) end end end - @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" - pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) - @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" - mdm_pod_inventory_es = Fluent::MultiEventStream.new - pod_inventory_mdm_records.each { |pod_inventory_mdm_record| - mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record - } if pod_inventory_mdm_records - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end - else - $log.warn "in_kube_podmdminventory:parse_and_emit_records:MDM pod inventory state file doesnt exist @ #{Time.now.utc.iso8601}" + @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" + pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) + @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" + mdm_pod_inventory_es = Fluent::MultiEventStream.new + pod_inventory_mdm_records.each { |pod_inventory_mdm_record| + mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record + } if pod_inventory_mdm_records + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end rescue => errorStr $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}" 
@@ -171,5 +163,38 @@ def run_periodic end @mutex.unlock end + + def readMDMRecords() + maxRetryCount = 3 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 + mdmRecords = {} + begin + f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") + if !f.nil? + isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "readMDMRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + mdmRecords = Yajl::Parser.parse(f) + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + $log.info "in_kube_podmdminventory:readMDMRecords:Number of MDM records: #{mdmRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + else + raise "readMDMRecords:Failed to open file for read" + end + rescue => err + if retryAttemptCount < MaxRetryCount + retryAttemptCount = retryAttemptCount + 1 + sleep (initialRetryDelay * retryAttemptCount) + retry + end + $log.warn "in_kube_podmdminventory:readMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ensure + if !f.nil? 
+ f.flock(File::LOCK_UN) + f.close + end + end + return mdmRecords + end end # Kube_Pod_Input end # module From a12e535c52fdd8a42cf4bd444118e4898be33fc5 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 8 Feb 2022 00:21:44 -0800 Subject: [PATCH 28/65] bug fixes --- build/linux/installer/conf/kube.conf | 2 +- source/plugins/ruby/in_kube_podinventory.rb | 21 +++++----- .../plugins/ruby/in_kube_podmdminventory.rb | 39 ++++++++++--------- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index d8bcc53da..dcdf1cdf8 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -262,7 +262,7 @@ path /var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length 50 flush_interval 20s retry_max_times 10 retry_wait 5s diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 8432965a4..aac0247c3 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -516,7 +516,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) mdmPodRecord["PodStatus"] = record["PodStatus"] mdmPodRecord["PodReadyCondition"] = getPodReadyCondition(item["status"]["conditions"]) mdmPodRecord["ControllerKind"] = record["ControllerKind"] - mdmPodRecord["containeRecords"] = [] + mdmPodRecord["containerRecords"] = [] podContainers = [] if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty? @@ -630,7 +630,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) end if !mdmContainerRecord.empty? 
- mdmPodRecord["containeRecords"].push(mdmContainerRecord.dup) + mdmPodRecord["containerRecords"].push(mdmContainerRecord.dup) end podRestartCount += containerRestartCount @@ -1105,28 +1105,27 @@ def writeMDMRecords(mdmRecordsJson) f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "w") if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_podinventory:writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i - f.truncate(0) f.write(mdmRecordsJson) f.flush timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) $log.info "in_kube_podinventory:writeMDMRecords:Successfull and with time taken(ms): #{timetakenMs}" else - raise "writeMDMRecords:Failed to open file for write" + raise "in_kube_podinventory:writeMDMRecords:Failed to open file for write" end rescue => err - if retryAttemptCount < MaxRetryCount + if retryAttemptCount < maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? retryAttemptCount = retryAttemptCount + 1 - sleep (initialRetryDelay * retryAttemptCount) + sleep (initialRetryDelaySecs * retryAttemptCount) retry end $log.warn "in_kube_podinventory:writeMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" ensure - if !f.nil? - f.flock(File::LOCK_UN) - f.close - end + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? 
end end diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 2afa9a547..a23a84c9b 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -72,13 +72,13 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:readMDMRecords @ #{Time.now.utc.iso8601}" mdmPodRecords = readMDMRecords() $log.info "in_kube_podmdminventory:parse_and_emit_records:End:readMDMRecords @ #{Time.now.utc.iso8601}" - if !mdmPodRecords.nil? && !mdmPodRecords.empty? + if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmRecords.length > 0 mdmPodRecords.each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) - containeRecords = record["containeRecords"] - if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 - containeRecords.each do |containerRecord| + containerRecords = record["containerRecords"] + if !containerRecords.nil? && !containerRecords.empty? && containerRecords.length > 0 + containerRecords.each do |containerRecord| if !containerRecord["state"].nil? && !containerRecord["state"].empty? 
@inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) end @@ -118,17 +118,20 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) mdm_pod_inventory_es = Fluent::MultiEventStream.new pod_inventory_mdm_records.each { |pod_inventory_mdm_record| mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record + if mdm_pod_inventory_es.count >= 5000 # 5k records of MDM is ~2MB and each record is ~400 bytes + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) + mdm_pod_inventory_es = Fluent::MultiEventStream.new + end } if pod_inventory_mdm_records - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es + if mdm_pod_inventory_es.count > 0 + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) + end + mdm_pod_inventory_es = nil end rescue => errorStr $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - ensure - if !file.nil? - file.close - end end end @@ -168,31 +171,31 @@ def readMDMRecords() maxRetryCount = 3 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 - mdmRecords = {} + mdmRecords = [] begin f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") if !f.nil? 
isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "readMDMRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_podmdminventory:readMDMRecords:Failed to acquire file lock" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i mdmRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) $log.info "in_kube_podmdminventory:readMDMRecords:Number of MDM records: #{mdmRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "readMDMRecords:Failed to open file for read" + raise "in_kube_podmdminventory:readMDMRecords:Failed to open file for read" end rescue => err - if retryAttemptCount < MaxRetryCount + if retryAttemptCount < maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? retryAttemptCount = retryAttemptCount + 1 - sleep (initialRetryDelay * retryAttemptCount) + sleep (initialRetryDelaySecs * retryAttemptCount) retry end $log.warn "in_kube_podmdminventory:readMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" ensure - if !f.nil? - f.flock(File::LOCK_UN) - f.close - end + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? 
end return mdmRecords end From 03e0b439acbd68ee3f61bf8969b09fd4bdbe7837 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 8 Feb 2022 11:18:37 -0800 Subject: [PATCH 29/65] bug fixes --- source/plugins/ruby/in_kube_podmdminventory.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index a23a84c9b..971197f49 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -72,7 +72,7 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:readMDMRecords @ #{Time.now.utc.iso8601}" mdmPodRecords = readMDMRecords() $log.info "in_kube_podmdminventory:parse_and_emit_records:End:readMDMRecords @ #{Time.now.utc.iso8601}" - if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmRecords.length > 0 + if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmPodRecords.length > 0 mdmPodRecords.each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) From ab27436bd6decec7e90e51999697a842937727a2 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 8 Feb 2022 12:51:19 -0800 Subject: [PATCH 30/65] remove unneeded log lines --- source/plugins/ruby/in_kube_nodes.rb | 5 +++-- source/plugins/ruby/in_kube_perfinventory.rb | 8 +++---- source/plugins/ruby/in_kube_podinventory.rb | 23 ++++++++++---------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 3e8e8ee71..146da8f9d 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -652,7 +652,7 @@ def watch_nodes !item["metadata"].nil? && !item["metadata"].empty? 
&& !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? nodesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") else $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil @@ -691,7 +691,8 @@ def watch_nodes end end rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 9733130af..c37c3ce0e 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -340,9 +340,9 @@ def watch_pods !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
podsResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") podsResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! break @@ -377,11 +377,11 @@ def watch_pods $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end - $log.info("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end rescue Net::ReadTimeout => errorStr ## This expected if there is no activity more than readtimeout value used in the connection - $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + # $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") podsResourceVersion = nil diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index aac0247c3..24eea4dbf 100644 
--- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -795,9 +795,9 @@ def watch_pods !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? podsResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") podsResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
break @@ -844,11 +844,11 @@ def watch_pods $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end - $log.info("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity more than readtimeout value used in the connection - $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") podsResourceVersion = nil @@ -923,9 +923,9 @@ def watch_services !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
servicesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! break @@ -962,7 +962,7 @@ def watch_services end end rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + # $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil @@ -1051,9 +1051,9 @@ def watch_windows_nodes !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
nodesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! break @@ -1081,7 +1081,8 @@ def watch_windows_nodes end end rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + ## This expected if there is no activity more than readtimeout value used in the connection + # $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil From 541e50da0b3f1cb2433c4be5c11da50e8ba06eb2 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 8 Feb 2022 19:06:29 -0800 Subject: [PATCH 31/65] more improvements --- source/plugins/ruby/constants.rb | 1 + source/plugins/ruby/in_kube_nodes.rb | 53 ++++++ source/plugins/ruby/in_kube_perfinventory.rb | 170 ++++-------------- source/plugins/ruby/in_kube_podinventory.rb | 1 + .../plugins/ruby/in_kube_podmdminventory.rb | 17 +- 5 files changed, 95 insertions(+), 147 deletions(-) diff --git 
a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 5576d9917..6f8c1256f 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -139,4 +139,5 @@ class Constants # FileName for MDM POD Inventory records MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json" + NODE_ALLOCATABLE_RECORDS_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/NodeAllocatableRecords.json" end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 146da8f9d..d3077e713 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -202,10 +202,19 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) insightsMetricsEventStream = Fluent::MultiEventStream.new kubePerfEventStream = Fluent::MultiEventStream.new @@istestvar = @env["ISTEST"] + nodeAllocatableRecords = {} #get node inventory nodeInventory["items"].each do |item| # node inventory nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) + # node allocatble records for the kube perf plugin + nodeName = item["metadata"]["name"] + if !nodeName.nil? && !nodeName.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + nodeAllocatableRecords[nodeName] = nodeAllocatable + end + end eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") @@ -425,6 +434,17 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end end + if !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? 
+ nodeAllocatableRecordsJson = nodeAllocatableRecords.to_json + if !nodeAllocatableRecordsJson.empty? + @log.info "Writing node allocatable records to state file with size(bytes): #{nodeAllocatableRecordsJson.length}" + @log.info "in_kube_nodes::parse_and_emit_records:Start:writeNodeAllocatableRecords @ #{Time.now.utc.iso8601}" + writeNodeAllocatableRecords(nodeAllocatableRecordsJson) + @log.info "in_kube_nodes::parse_and_emit_records:End:writeNodeAllocatableRecords @ #{Time.now.utc.iso8601}" + end + nodeAllocatableRecordsJson = nil + nodeAllocatableRecords = nil + end rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -707,6 +727,39 @@ def watch_nodes end $log.info("in_kube_nodes::watch_nodes:End @ #{Time.now.utc.iso8601}") end + + def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) + maxRetryCount = 3 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 + begin + f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "w") + if !f.nil? + isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + f.write(nodeAllocatbleRecordsJson) + f.flush + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + $log.info "in_kube_nodes::writeNodeAllocatableRecords:Successfull and with time taken(ms): #{timetakenMs}" + else + raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to open file for write" + end + rescue => err + if retryAttemptCount < maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? 
+ retryAttemptCount = retryAttemptCount + 1 + sleep (initialRetryDelaySecs * retryAttemptCount) + retry + end + $log.warn "in_kube_nodes::writeNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) + ensure + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? + end + end end # Kube_Node_Input class NodeStatsCache diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index c37c3ce0e..5faae3194 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -33,7 +33,6 @@ def initialize @podItemsCache = {} @watchNodesThread = nil - @nodeAllocatableCache = {} @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" @@ -80,9 +79,7 @@ def start @condition = ConditionVariable.new @mutex = Mutex.new @podCacheMutex = Mutex.new - @nodeAllocatableCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) - @watchNodesThread = Thread.new(&method(:watch_nodes)) @watchPodsThread = Thread.new(&method(:watch_pods)) end end @@ -118,11 +115,7 @@ def enumerate(podList = nil) $log.info("in_kube_perfinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") end - nodeAllocatableRecords = {} - nodeAllocatableCacheSizeKB = 0 - @nodeAllocatableCacheMutex.synchronize { - nodeAllocatableRecords = @nodeAllocatableCache.clone - } + nodeAllocatableRecords = getNodeAllocatableRecords() $log.info("in_kube_perfinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") # Initializing continuation token to nil continuationToken = nil @@ -397,139 +390,38 @@ def watch_pods $log.info("in_kube_perfinventory::watch_pods:End @ #{Time.now.utc.iso8601}") end - def watch_nodes - 
$log.info("in_kube_perfinventory::watch_nodes:Start @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - loop do - begin - if nodesResourceVersion.nil? - # clear node limits cache before filling the cache with list - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache.clear() - } - continuationToken = nil - $log.info("in_kube_perfinventory::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_perfinventory::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_perfinventory::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| - key = item["metadata"]["name"] - if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - if !nodeAllocatable.nil? && !nodeAllocatable.empty? - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end - else - $log.warn "in_kube_perfinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" - end - while (!continuationToken.nil? && !continuationToken.empty?) 
- continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_perfinventory::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| - key = item["metadata"]["name"] - if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - if !nodeAllocatable.nil? && !nodeAllocatable.empty? - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end - else - $log.warn "in_kube_perfinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" - end - end - end - begin - $log.info("in_kube_perfinventory::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_perfinventory::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? 
&& - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - nodesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_perfinventory::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_perfinventory::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["name"] - if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - if !nodeAllocatable.nil? && !nodeAllocatable.empty? - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["name"] - if !key.nil? && !key.empty? 
- @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache.delete(key) - } - end - end - when "ERROR" - nodesResourceVersion = nil - $log.warn("in_kube_perfinventory::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_perfinventory::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") - end - end - end - rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher - end - rescue => errorStr - $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil + def getNodeAllocatableRecords() + maxRetryCount = 3 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 + nodeAllocatableRecords = {} + begin + f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "r") + if !f.nil? + isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + nodeAllocatableRecords = Yajl::Parser.parse(f) + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + $log.info "in_kube_perfinventory:getNodeAllocatableRecords:Number of Node Allocatable records: #{nodeAllocatableRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + else + raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to open file for read" + end + rescue => err + if retryAttemptCount < maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? 
+ retryAttemptCount = retryAttemptCount + 1 + sleep (initialRetryDelaySecs * retryAttemptCount) + retry end + $log.warn "in_kube_perfinventory:getNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) + ensure + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? end - $log.info("in_kube_perfinventory::watch_nodes:End @ #{Time.now.utc.iso8601}") + return nodeAllocatableRecords end end # Kube_Pod_Input end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 24eea4dbf..70167e012 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1124,6 +1124,7 @@ def writeMDMRecords(mdmRecordsJson) retry end $log.warn "in_kube_podinventory:writeMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) ensure f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 971197f49..98f06dc0c 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -69,9 +69,9 @@ def enumerate def parse_and_emit_records(batchTime = Time.utc.iso8601) begin - $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:readMDMRecords @ #{Time.now.utc.iso8601}" - mdmPodRecords = readMDMRecords() - $log.info "in_kube_podmdminventory:parse_and_emit_records:End:readMDMRecords @ #{Time.now.utc.iso8601}" + $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:getMDMRecords @ #{Time.now.utc.iso8601}" + mdmPodRecords = getMDMRecords() + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
&& mdmPodRecords.length > 0 mdmPodRecords.each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @@ -167,7 +167,7 @@ def run_periodic @mutex.unlock end - def readMDMRecords() + def getMDMRecords() maxRetryCount = 3 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 @@ -176,13 +176,13 @@ def readMDMRecords() f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "in_kube_podmdminventory:readMDMRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_podmdminventory:getMDMRecords:Failed to acquire file lock" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i mdmRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) - $log.info "in_kube_podmdminventory:readMDMRecords:Number of MDM records: #{mdmRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "in_kube_podmdminventory:readMDMRecords:Failed to open file for read" + raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read" end rescue => err if retryAttemptCount < maxRetryCount @@ -192,7 +192,8 @@ def readMDMRecords() sleep (initialRetryDelaySecs * retryAttemptCount) retry end - $log.warn "in_kube_podmdminventory:readMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podmdminventory:getMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) ensure f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? 
From 589b69a9472eed0321781afa48ebf4daca586d71 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 9 Feb 2022 14:34:05 -0800 Subject: [PATCH 32/65] clean up --- kubernetes/omsagent.yaml | 2 +- source/plugins/ruby/KubernetesApiClient.rb | 19 ++++++++++ source/plugins/ruby/constants.rb | 3 +- source/plugins/ruby/in_kube_perfinventory.rb | 13 ------- source/plugins/ruby/in_kube_podinventory.rb | 37 ++++++-------------- 5 files changed, 33 insertions(+), 41 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 5c9e8f853..b0ccb6712 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -609,7 +609,7 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 4 + cpu: 5 memory: 2Gi requests: cpu: 150m diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index f1afd4ac6..0d4267685 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -1310,6 +1310,25 @@ def getHpaOptimizedItem(resourceItem) return item end + def getPodReadyCondition(podStatusConditions) + podReadyCondition = false + begin + if !podStatusConditions.nil? && !podStatusConditions.empty? + podStatusConditions.each do |condition| + if condition["type"] == "Ready" + if condition["status"].downcase == "true" + podReadyCondition = true + end + break #Exit the for loop since we found the ready condition + end + end + end + rescue => err + @Log.warn "in_kube_podinventory::getPodReadyCondition failed with an error: #{err}" + end + return podReadyCondition + end + def isEmitCacheTelemetry isEmitCacheTelemtryEnabled = false if !ENV["EMIT_CACHE_TELEMETRY"].nil? && !ENV["EMIT_CACHE_TELEMETRY"].empty? 
&& ENV["EMIT_CACHE_TELEMETRY"].downcase == "true" diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 6f8c1256f..ca966fb12 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -137,7 +137,8 @@ class Constants #To evaluate switching to Windows AMA 64KB impacts any existing customers MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY = 65536 - # FileName for MDM POD Inventory records + # FileName for MDM POD Inventory state MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json" + # FileName for NodeAllocatable Records state NODE_ALLOCATABLE_RECORDS_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/NodeAllocatableRecords.json" end diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 5faae3194..00f7b02db 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -27,13 +27,10 @@ def initialize # this configurable via configmap @PODS_CHUNK_SIZE = 0 @PODS_EMIT_STREAM_BATCH_SIZE = 0 - @NODES_CHUNK_SIZE = 0 @watchPodsThread = nil @podItemsCache = {} - @watchNodesThread = nil - @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end @@ -66,15 +63,6 @@ def start end $log.info("in_kube_perfinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") - if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? 
&& ENV["NODES_CHUNK_SIZE"].to_i > 0 - @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i - else - # this shouldnt happen just setting default here as safe guard - $log.warn("in_kube_perfinventory::start: setting to default value since got NODES_CHUNK_SIZE nil or empty") - @NODES_CHUNK_SIZE = 250 - end - $log.info("in_kube_perfinventory::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}") - @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -92,7 +80,6 @@ def shutdown } @thread.join @watchPodsThread.join - @watchNodesThread.join super # This super must be at the end of shutdown method end end diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 70167e012..905fd0e34 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -348,12 +348,16 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if continuationToken.nil? #no more chunks in this batch to be sent, write all mdm pod inventory records to send - if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 - mdmPodRecordsJson = @mdmPodRecords.to_json - @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" - @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" - writeMDMRecords(mdmPodRecordsJson) - @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" + begin + if !@mdmPodRecords.nil? 
&& @mdmPodRecords.length > 0 + mdmPodRecordsJson = @mdmPodRecords.to_json + @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" + @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" + writeMDMRecords(mdmPodRecordsJson) + @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" + end + rescue => err + @log.warn "in_kube_podinventory::parse_and_emit_records: failed to write MDMRecords with an error: #{err} @ #{Time.now.utc.iso8601}" end end @@ -514,7 +518,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) mdmPodRecord["ControllerName"] = record["ControllerName"] mdmPodRecord["Namespace"] = record["Namespace"] mdmPodRecord["PodStatus"] = record["PodStatus"] - mdmPodRecord["PodReadyCondition"] = getPodReadyCondition(item["status"]["conditions"]) + mdmPodRecord["PodReadyCondition"] = KubernetesApiClient.getPodReadyCondition(item["status"]["conditions"]) mdmPodRecord["ControllerKind"] = record["ControllerKind"] mdmPodRecord["containerRecords"] = [] @@ -1130,24 +1134,5 @@ def writeMDMRecords(mdmRecordsJson) f.close if !f.nil? end end - - def getPodReadyCondition(podStatusConditions) - podReadyCondition = false - begin - if !podStatusConditions.nil? && !podStatusConditions.empty? 
- podStatusConditions.each do |condition| - if condition["type"] == "Ready" - if condition["status"].downcase == "true" - podReadyCondition = true - end - break #Exit the for loop since we found the ready condition - end - end - end - rescue => err - $log.warn "in_kube_podinventory::getPodReadyCondition failed with an error: #{err}" - end - return podReadyCondition - end end # Kube_Pod_Input end # module From 37d67b859c6ced09472340584040407f8c7a9897 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 9 Feb 2022 16:33:49 -0800 Subject: [PATCH 33/65] clean up --- .../templates/omsagent-deployment.yaml | 5 + kubernetes/linux/main.sh | 126 +++++++++--------- source/plugins/ruby/in_kube_podinventory.rb | 6 +- 3 files changed, 72 insertions(+), 65 deletions(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index a7ea8b097..ac7cafa13 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -43,6 +43,11 @@ spec: resources: {{ toYaml .Values.omsagent.resources.deployment | indent 9 }} env: + - name: NUM_OF_FLUENTD_WORKERS + valueFrom: + resourceFieldRef: + containerName: omsagent + resource: limits.cpu {{- if ne .Values.omsagent.env.clusterId "" }} - name: AKS_RESOURCE_ID value: {{ .Values.omsagent.env.clusterId | quote }} diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c280a31a0..5f3c4c902 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -80,6 +80,66 @@ checkAgentOnboardingStatus() { fi } +configureFluentDWorkerIDsForRS() { + echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}" + case $NUM_OF_FLUENTD_WORKERS in + 5) + export NUM_OF_FLUENTD_WORKERS=5 + export FLUENTD_POD_INVENTORY_WORKER_ID=4 + export FLUENTD_NODE_INVENTORY_WORKER_ID=3 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 + export 
FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + 4) + export NUM_OF_FLUENTD_WORKERS=4 + export FLUENTD_POD_INVENTORY_WORKER_ID=3 + export FLUENTD_NODE_INVENTORY_WORKER_ID=2 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + 3) + export NUM_OF_FLUENTD_WORKERS=3 + export FLUENTD_POD_INVENTORY_WORKER_ID=2 + export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + 2) + export NUM_OF_FLUENTD_WORKERS=2 + export FLUENTD_POD_INVENTORY_WORKER_ID=1 + export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + + *) + export NUM_OF_FLUENTD_WORKERS=1 + export FLUENTD_POD_INVENTORY_WORKER_ID=0 + export FLUENTD_NODE_INVENTORY_WORKER_ID=0 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + esac + echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc + echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc + source ~/.bashrc + + echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" + echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}" + echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}" + echo "pod mdm inventory worker id: 
${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" + echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" +} + #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding mkdir -p /var/opt/microsoft/docker-cimprov/state @@ -202,7 +262,7 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then export MDSD_PROXY_USERNAME=$user echo "export MDSD_PROXY_USERNAME=$MDSD_PROXY_USERNAME" >>~/.bashrc export MDSD_PROXY_PASSWORD_FILE=/opt/microsoft/docker-cimprov/proxy_password - echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >> ~/.bashrc + echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >>~/.bashrc #TODO: Compression + proxy creates a deserialization error in ODS. This needs a fix in MDSD export MDSD_ODS_COMPRESSION_LEVEL=0 @@ -434,7 +494,6 @@ fi export CONTAINER_RUNTIME="containerd" export NODE_NAME="" - if [ "$cAdvisorIsSecure" = true ]; then echo "Using port 10250" export IS_SECURE_CADVISOR_PORT=true @@ -460,7 +519,7 @@ if [ ! 
-z "$podWithValidContainerId" ]; then containerRuntime=$(echo $containerRuntime | tr "[:upper:]" "[:lower:]") nodeName=$(echo $nodeName | tr "[:upper:]" "[:lower:]") # use default container runtime if obtained runtime value is either empty or null - if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then + if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then echo "using default container runtime as $CONTAINER_RUNTIME since got containeRuntime as empty or null" else export CONTAINER_RUNTIME=$containerRuntime @@ -592,65 +651,8 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "*** starting fluentd v1 in daemonset" fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else - case $NUM_OF_FLUENTD_WORKERS in - 5) - export NUM_OF_FLUENTD_WORKERS=5 - export FLUENTD_POD_INVENTORY_WORKER_ID=4 - export FLUENTD_NODE_INVENTORY_WORKER_ID=3 - export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 - export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 - export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - ;; - 4) - export NUM_OF_FLUENTD_WORKERS=4 - export FLUENTD_POD_INVENTORY_WORKER_ID=3 - export FLUENTD_NODE_INVENTORY_WORKER_ID=2 - export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 - export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 - export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - ;; - 3) - export NUM_OF_FLUENTD_WORKERS=3 - export FLUENTD_POD_INVENTORY_WORKER_ID=2 - export FLUENTD_NODE_INVENTORY_WORKER_ID=1 - export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 - export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 - export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - ;; - 2) - export NUM_OF_FLUENTD_WORKERS=2 - export FLUENTD_POD_INVENTORY_WORKER_ID=1 - export FLUENTD_NODE_INVENTORY_WORKER_ID=1 - export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 - export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 - export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - ;; - - *) - export NUM_OF_FLUENTD_WORKERS=1 - export FLUENTD_POD_INVENTORY_WORKER_ID=0 - 
export FLUENTD_NODE_INVENTORY_WORKER_ID=0 - export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 - export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 - export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - ;; - esac - echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc - echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc - source ~/.bashrc - - echo "*** fluentd worker configuration ***" - echo "num of workers:${NUM_OF_FLUENTD_WORKERS}" - echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" - echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}" - echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}" - echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" - echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" - + echo "*** configure fluentd worker ids" + configureFluentDWorkerIDsForRS echo "*** starting fluentd v1 in replicaset" fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & fi diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 905fd0e34..455444f85 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -53,7 +53,7 @@ def initialize @watchServicesThread = nil @serviceItemsCache = {} - @watchNodesThread = nil + @watchWinNodesThread = nil @windowsNodeNameListCache = [] @windowsContainerRecordsCacheSizeBytes = 0 @@ -105,7 +105,7 @@ def start @serviceCacheMutex = Mutex.new 
@windowsNodeNameCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) - @watchNodesThread = Thread.new(&method(:watch_windows_nodes)) + @watchWinNodesThread = Thread.new(&method(:watch_windows_nodes)) @watchPodsThread = Thread.new(&method(:watch_pods)) @watchServicesThread = Thread.new(&method(:watch_services)) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i @@ -121,7 +121,7 @@ def shutdown @thread.join @watchPodsThread.join @watchServicesThread.join - @watchNodesThread.join + @watchWinNodesThread.join super # This super must be at the end of shutdown method end end From 886557b87c834756951ac0497bb9355aa371abe5 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 10 Feb 2022 23:05:36 -0800 Subject: [PATCH 34/65] add requestId header for mdm metrics --- source/plugins/ruby/out_mdm.rb | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index 82d6e07db..a542c5eb0 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -1,7 +1,7 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/output' +require "fluent/plugin/output" module Fluent::Plugin class OutputMDM < Output @@ -12,6 +12,7 @@ def initialize super require "net/http" require "net/https" + require "securerandom" require "uri" require "yajl/json_gem" require_relative "KubernetesApiClient" @@ -326,47 +327,49 @@ def send_to_mdm(post_body) else access_token = get_access_token end + requestId = SecureRandom.uuid.to_s request = Net::HTTP::Post.new(@post_request_uri.request_uri) request["Content-Type"] = "application/x-ndjson" request["Authorization"] = "Bearer #{access_token}" + request["x-request-id"] = requestId request.body = post_body.join("\n") - @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024}" + @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024} for requestId: #{requestId}" response = 
@http_client.request(request) response.value # this throws for non 200 HTTP response code - @log.info "HTTP Post Response Code : #{response.code}" + @log.info "HTTP Post Response Code : #{response.code} for requestId: #{requestId}" if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) @last_telemetry_sent_time = Time.now end - rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException + rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException if !response.nil? && !response.body.nil? #body will have actual error - @log.info "Failed to Post Metrics to MDM : #{e} Response.body: #{response.body}" + @log.info "Failed to Post Metrics to MDM for requestId: #{requestId} exception: #{e} Response.body: #{response.body}" else - @log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}" + @log.info "Failed to Post Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" end @log.debug_backtrace(e.backtrace) if !response.code.empty? && response.code == 403.to_s - @log.info "Response Code #{response.code} Updating @last_post_attempt_time" + @log.info "Response Code #{response.code} for requestId: #{requestId} Updating @last_post_attempt_time" @last_post_attempt_time = Time.now @first_post_attempt_made = true # Not raising exception, as that will cause retries to happen elsif !response.code.empty? 
&& response.code.start_with?("4") # Log 400 errors and continue - @log.info "Non-retryable HTTPClientException when POSTing Metrics to MDM #{e} Response: #{response}" + @log.info "Non-retryable HTTPClientException when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" else # raise if the response code is non-400 - @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" + @log.info "HTTPServerException when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" raise e end # Adding exceptions to hash to aggregate and send telemetry for all 400 error codes exception_aggregator(e) rescue Errno::ETIMEDOUT => e - @log.info "Timed out when POSTing Metrics to MDM : #{e} Response: #{response}" + @log.info "Timed out when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" @log.debug_backtrace(e.backtrace) raise e rescue Exception => e - @log.info "Exception POSTing Metrics to MDM : #{e} Response: #{response}" + @log.info "Exception POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" @log.debug_backtrace(e.backtrace) raise e end From c594c5a3bac3d9202faf907a5f1dc9db7a8a30a3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 16 Feb 2022 22:56:18 -0800 Subject: [PATCH 35/65] latest mdsd and fix for threading issue in out mdm --- build/linux/installer/conf/kube.conf | 26 +++++++++++----------- charts/azuremonitor-containers/values.yaml | 2 +- kubernetes/linux/main.sh | 26 +++++++++++++++++----- kubernetes/linux/setup.sh | 5 +++-- kubernetes/omsagent.yaml | 6 +++-- source/plugins/ruby/out_mdm.rb | 24 +++++++++----------- 6 files changed, 51 insertions(+), 38 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index dcdf1cdf8..016a7942a 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -21,7 
+21,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -54,7 +54,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -90,7 +90,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -116,7 +116,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -151,7 +151,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -182,7 +182,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -201,7 +201,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -236,7 +236,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -263,7 +263,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 50 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 
5m @@ -350,7 +350,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -377,7 +377,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -403,7 +403,7 @@ # overflow_action drop_oldest_chunk # chunk_limit_size 4m # queue_limit_length 20 - # flush_interval 20s + # flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" # retry_max_times 10 # retry_wait 5s # retry_max_interval 5m @@ -422,7 +422,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index aa4c6bcf2..e15791d21 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -25,7 +25,7 @@ omsagent: tagWindows: "win-ciprod01312022" pullPolicy: IfNotPresent dockerProviderVersion: "16.0.0-0" - agentVersion: "azure-mdsd-1.14.2" + agentVersion: "azure-mdsd-1..17.0" winAgentVersion: "0.0.0-0" # there is no base agent version for windows agent # The priority used by the omsagent priority class for the daemonset pods diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 5f3c4c902..80d13805c 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -80,8 +80,9 @@ checkAgentOnboardingStatus() { fi } -configureFluentDWorkerIDsForRS() { +configureFluentDConfigForRS() { echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}" + export FLUENTD_FLUSH_INTERVAL="20s" # default 20s, evaluate if required lower flush interval at high scale case $NUM_OF_FLUENTD_WORKERS in 5) export 
NUM_OF_FLUENTD_WORKERS=5 @@ -90,6 +91,7 @@ configureFluentDWorkerIDsForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + #export FLUENTD_FLUSH_INTERVAL="5s" ;; 4) export NUM_OF_FLUENTD_WORKERS=4 @@ -98,6 +100,7 @@ configureFluentDWorkerIDsForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + #export FLUENTD_FLUSH_INTERVAL="10s" ;; 3) export NUM_OF_FLUENTD_WORKERS=3 @@ -106,6 +109,7 @@ configureFluentDWorkerIDsForRS() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + #export FLUENTD_FLUSH_INTERVAL="15s" ;; 2) export NUM_OF_FLUENTD_WORKERS=2 @@ -114,6 +118,7 @@ configureFluentDWorkerIDsForRS() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + #export FLUENTD_FLUSH_INTERVAL="20s" ;; *) @@ -123,6 +128,7 @@ configureFluentDWorkerIDsForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + #export FLUENTD_FLUSH_INTERVAL="20s" ;; esac echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc @@ -131,6 +137,9 @@ configureFluentDWorkerIDsForRS() { echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_FLUSH_INTERVAL=$FLUENTD_FLUSH_INTERVAL" >>~/.bashrc + source ~/.bashrc echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" @@ -138,6 +147,7 @@ 
configureFluentDWorkerIDsForRS() { echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}" echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" + echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}" } #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding @@ -264,9 +274,6 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then export MDSD_PROXY_PASSWORD_FILE=/opt/microsoft/docker-cimprov/proxy_password echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >>~/.bashrc - #TODO: Compression + proxy creates a deserialization error in ODS. This needs a fix in MDSD - export MDSD_ODS_COMPRESSION_LEVEL=0 - echo "export MDSD_ODS_COMPRESSION_LEVEL=$MDSD_ODS_COMPRESSION_LEVEL" >>~/.bashrc fi if [ ! -z "$PROXY_ENDPOINT" ]; then @@ -616,6 +623,13 @@ else echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >>~/.bashrc export MDSD_FLUENT_SOCKET_PORT="29230" echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >>~/.bashrc + # set the libcurl specific env and configuration + export ENABLE_CURL_UPLOAD=true + echo "export ENABLE_CURL_UPLOAD=$ENABLE_CURL_UPLOAD" >> ~/.bashrc + export CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt + echo "export CURL_CA_BUNDLE=$CURL_CA_BUNDLE" >> ~/.bashrc + mkdir -p /etc/pki/tls/certs + cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt fi source ~/.bashrc @@ -651,8 +665,8 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "*** starting fluentd v1 in daemonset" fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else - echo "*** configure fluentd worker ids" - configureFluentDWorkerIDsForRS + echo "*** configure fluentd config" + configureFluentDConfigForRS echo "*** starting fluentd v1 in replicaset" fluentd -c 
/etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & fi diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 872ac99cf..aca05fc08 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -9,8 +9,9 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ update-locale LANG=en_US.UTF-8 -#install oneagent - Official bits (10/7/2021) -wget https://github.com/microsoft/Docker-Provider/releases/download/1.14/azure-mdsd_1.14.2-build.master.284_x86_64.deb +#install oneagent - Official bits (02/15/2022) +wget https://github.com/microsoft/Docker-Provider/releases/download/1.17.0/azure-mdsd_1.17.0-build.master.352_x86_64.deb + /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index b0ccb6712..19f5afe92 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -357,7 +357,7 @@ spec: component: oms-agent tier: node annotations: - agentVersion: "azure-mdsd-1.14.2" + agentVersion: "azure-mdsd-1..17.0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: @@ -598,7 +598,7 @@ spec: labels: rsName: "omsagent-rs" annotations: - agentVersion: "azure-mdsd-1.14.2" + agentVersion: "azure-mdsd-1..17.0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: @@ -620,6 +620,8 @@ spec: resourceFieldRef: containerName: omsagent resource: limits.cpu + # - name: MONITORING_MAX_EVENT_RATE + # value: "50000" # default 20KPS for MDSD, for large cluster validate 50KPS - name: EMIT_CACHE_TELEMETRY value: "true" # enable only debug or test purpose and disable for prod - name: AKS_RESOURCE_ID diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index a542c5eb0..fb66ec158 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -43,7 +43,6 @@ def 
initialize @data_hash = {} @parsed_token_uri = nil - @http_client = nil @token_expiry_time = Time.now @cached_access_token = String.new @last_post_attempt_time = Time.now @@ -62,6 +61,7 @@ def initialize @mdm_exceptions_hash = {} @mdm_exceptions_count = 0 @mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i + @proxy = nil end def configure(conf) @@ -97,18 +97,7 @@ def start end @@post_request_url = @@post_request_url_template % { aks_region: aks_region, aks_resource_id: aks_resource_id } @post_request_uri = URI.parse(@@post_request_url) - if (!!@isArcK8sCluster) - proxy = (ProxyUtils.getProxyConfiguration) - if proxy.nil? || proxy.empty? - @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) - else - @log.info "Proxy configured on this cluster: #{aks_resource_id}" - @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port, proxy[:addr], proxy[:port], proxy[:user], proxy[:pass]) - end - else - @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) - end - @http_client.use_ssl = true + @proxy = (ProxyUtils.getProxyConfiguration) @log.info "POST Request url: #{@@post_request_url}" ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPluginStart", {}) @@ -327,6 +316,13 @@ def send_to_mdm(post_body) else access_token = get_access_token end + if @proxy.nil? || @proxy.empty? 
+ http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) + else + @log.info "Proxy configured on this cluster: #{aks_resource_id}" + http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port, @proxy[:addr], @proxy[:port], @proxy[:user], @proxy[:pass]) + end + http_client.use_ssl = true requestId = SecureRandom.uuid.to_s request = Net::HTTP::Post.new(@post_request_uri.request_uri) request["Content-Type"] = "application/x-ndjson" @@ -335,7 +331,7 @@ def send_to_mdm(post_body) request.body = post_body.join("\n") @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024} for requestId: #{requestId}" - response = @http_client.request(request) + response = http_client.request(request) response.value # this throws for non 200 HTTP response code @log.info "HTTP Post Response Code : #{response.code} for requestId: #{requestId}" if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now From 0297f7bf9e6b12077690d25946d8cef075f79e18 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 17 Feb 2022 23:42:23 -0800 Subject: [PATCH 36/65] rs specific config for large cluster --- build/linux/installer/conf/kube.conf | 24 +++++------ kubernetes/linux/main.sh | 41 ++++++++++++++----- .../plugins/ruby/in_kube_podmdminventory.rb | 4 +- 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 016a7942a..50a917631 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -20,7 +20,7 @@ @type file overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -53,7 +53,7 @@ @type file overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" 
flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -89,7 +89,7 @@ path /var/opt/microsoft/docker-cimprov/state/kubepod*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -115,7 +115,7 @@ path /var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -150,7 +150,7 @@ path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -181,7 +181,7 @@ path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -200,7 +200,7 @@ path /var/opt/microsoft/docker-cimprov/state/out_mdm_nodeinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -235,7 +235,7 @@ path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -262,7 +262,7 @@ path 
/var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 50 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -349,7 +349,7 @@ path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -376,7 +376,7 @@ path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -421,7 +421,7 @@ path /var/opt/microsoft/docker-cimprov/state/out_mdm_cdvisorperf*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 80d13805c..752b85440 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -80,9 +80,10 @@ checkAgentOnboardingStatus() { fi } -configureFluentDConfigForRS() { +setReplicaSetSpecificConfig() { echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}" - export FLUENTD_FLUSH_INTERVAL="20s" # default 20s, evaluate if required lower flush interval at high scale + export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_QUEUE_LIMIT_LENGTH="20" # default case $NUM_OF_FLUENTD_WORKERS in 5) export NUM_OF_FLUENTD_WORKERS=5 @@ -91,7 +92,9 @@ configureFluentDConfigForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 
- #export FLUENTD_FLUSH_INTERVAL="5s" + export FLUENTD_FLUSH_INTERVAL="5s" + export FLUENTD_QUEUE_LIMIT_LENGTH="50" + export MONITORING_MAX_EVENT_RATE="50000" # default MDSD EPS is 20K which is not enough for large scale ;; 4) export NUM_OF_FLUENTD_WORKERS=4 @@ -100,7 +103,9 @@ configureFluentDConfigForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - #export FLUENTD_FLUSH_INTERVAL="10s" + export FLUENTD_FLUSH_INTERVAL="10s" + export FLUENTD_QUEUE_LIMIT_LENGTH="40" + export MONITORING_MAX_EVENT_RATE="40000" # default MDSD EPS is 20K which is not enough for large scale ;; 3) export NUM_OF_FLUENTD_WORKERS=3 @@ -109,7 +114,9 @@ configureFluentDConfigForRS() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - #export FLUENTD_FLUSH_INTERVAL="15s" + export FLUENTD_FLUSH_INTERVAL="15s" + export FLUENTD_QUEUE_LIMIT_LENGTH="30" + export MONITORING_MAX_EVENT_RATE="30000" # default MDSD EPS is 20K which is not enough for large scale ;; 2) export NUM_OF_FLUENTD_WORKERS=2 @@ -118,7 +125,9 @@ configureFluentDConfigForRS() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - #export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_QUEUE_LIMIT_LENGTH="20" + export MONITORING_MAX_EVENT_RATE="25000" # default MDSD EPS is 20K which is not enough for large scale ;; *) @@ -128,7 +137,8 @@ configureFluentDConfigForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - #export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_QUEUE_LIMIT_LENGTH="20" ;; esac echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc @@ -139,6 +149,12 @@ configureFluentDConfigForRS() { echo 
"export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_FLUSH_INTERVAL=$FLUENTD_FLUSH_INTERVAL" >>~/.bashrc + echo "export FLUENTD_QUEUE_LIMIT_LENGTH=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc + + if [ ! -z $MONITORING_MAX_EVENT_RATE ]; then + echo "export MONITORING_MAX_EVENT_RATE=$MONITORING_MAX_EVENT_RATE" >>~/.bashrc + echo "Configured MDSD Max EPS is: ${MONITORING_MAX_EVENT_RATE}" + fi source ~/.bashrc @@ -148,6 +164,7 @@ configureFluentDConfigForRS() { echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}" + echo "fluentd buffer plugin queue length=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc } #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding @@ -581,6 +598,10 @@ echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >>~/.bashrc +if [ "${CONTROLLER_TYPE}" == "ReplicaSet" ]; then + echo "*** set applicable replicaset config ***" + setReplicaSetSpecificConfig +fi #skip imds lookup since not used either legacy or aad msi auth path export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >>~/.bashrc @@ -665,10 +686,8 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "*** starting fluentd v1 in daemonset" fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else - echo "*** configure fluentd config" - configureFluentDConfigForRS - echo "*** starting fluentd v1 in replicaset" - fluentd -c 
/etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & + echo "*** starting fluentd v1 in replicaset" + fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & fi fi diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 98f06dc0c..714d78e07 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -112,9 +112,9 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) end end end - @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" + @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm @ #{Time.now.utc.iso8601}" pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) - @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" + @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size} @ #{Time.now.utc.iso8601}" mdm_pod_inventory_es = Fluent::MultiEventStream.new pod_inventory_mdm_records.each { |pod_inventory_mdm_record| mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record From 89a96da17529cb20bd1c368a20165cbfd113c61a Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 21 Feb 2022 18:33:28 -0800 Subject: [PATCH 37/65] optimize out mdm --- build/linux/installer/conf/kube.conf | 2 +- kubernetes/linux/main.sh | 25 +++++++++++-------- source/plugins/ruby/constants.rb | 2 ++ .../plugins/ruby/in_kube_podmdminventory.rb | 21 +++++++++------- source/plugins/ruby/out_mdm.rb | 4 +++ source/plugins/ruby/podinventory_to_mdm.rb | 2 +- 6 files changed, 34 insertions(+), 22 deletions(-) diff --git 
a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 50a917631..1a566ec28 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -267,7 +267,7 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count "#{ENV['FLUENTD_MDM_FLUSH_THREAD_COUNT']}" retry_mdm_post_wait_minutes 30 diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 752b85440..0f113905c 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -84,6 +84,7 @@ setReplicaSetSpecificConfig() { echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}" export FLUENTD_FLUSH_INTERVAL="20s" export FLUENTD_QUEUE_LIMIT_LENGTH="20" # default + export FLUENTD_MDM_FLUSH_THREAD_COUNT="5" # default case $NUM_OF_FLUENTD_WORKERS in 5) export NUM_OF_FLUENTD_WORKERS=5 @@ -93,8 +94,9 @@ setReplicaSetSpecificConfig() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="5s" - export FLUENTD_QUEUE_LIMIT_LENGTH="50" - export MONITORING_MAX_EVENT_RATE="50000" # default MDSD EPS is 20K which is not enough for large scale + export FLUENTD_QUEUE_LIMIT_LENGTH="60" + export MONITORING_MAX_EVENT_RATE="100000" # default MDSD EPS is 20K which is not enough for large scale + export FLUENTD_MDM_FLUSH_THREAD_COUNT="20" # if the pod mdm inventory running on separate worker ;; 4) export NUM_OF_FLUENTD_WORKERS=4 @@ -104,8 +106,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="10s" - export FLUENTD_QUEUE_LIMIT_LENGTH="40" - export MONITORING_MAX_EVENT_RATE="40000" # default MDSD EPS is 20K which is not enough for large scale + export FLUENTD_QUEUE_LIMIT_LENGTH="50" + export MONITORING_MAX_EVENT_RATE="80000" # default MDSD EPS is 20K which is not enough for large scale ;; 3) export NUM_OF_FLUENTD_WORKERS=3 @@ -115,8 +117,8 @@ 
setReplicaSetSpecificConfig() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="15s" - export FLUENTD_QUEUE_LIMIT_LENGTH="30" - export MONITORING_MAX_EVENT_RATE="30000" # default MDSD EPS is 20K which is not enough for large scale + export FLUENTD_QUEUE_LIMIT_LENGTH="40" + export MONITORING_MAX_EVENT_RATE="60000" # default MDSD EPS is 20K which is not enough for large scale ;; 2) export NUM_OF_FLUENTD_WORKERS=2 @@ -126,8 +128,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="20s" - export FLUENTD_QUEUE_LIMIT_LENGTH="20" - export MONITORING_MAX_EVENT_RATE="25000" # default MDSD EPS is 20K which is not enough for large scale + export FLUENTD_QUEUE_LIMIT_LENGTH="30" + export MONITORING_MAX_EVENT_RATE="40000" # default MDSD EPS is 20K which is not enough for large scale ;; *) @@ -145,11 +147,11 @@ setReplicaSetSpecificConfig() { echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_FLUSH_INTERVAL=$FLUENTD_FLUSH_INTERVAL" >>~/.bashrc echo "export FLUENTD_QUEUE_LIMIT_LENGTH=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc + echo "export FLUENTD_MDM_FLUSH_THREAD_COUNT=$FLUENTD_MDM_FLUSH_THREAD_COUNT" >>~/.bashrc if [ ! 
-z $MONITORING_MAX_EVENT_RATE ]; then echo "export MONITORING_MAX_EVENT_RATE=$MONITORING_MAX_EVENT_RATE" >>~/.bashrc @@ -164,7 +166,8 @@ setReplicaSetSpecificConfig() { echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}" - echo "fluentd buffer plugin queue length=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc + echo "fluentd out mdm flush thread count: $FLUENTD_MDM_FLUSH_THREAD_COUNT" >>~/.bashrc + echo "fluentd buffer plugin queue length: $FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc } #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index ca966fb12..883c6d15f 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -141,4 +141,6 @@ class Constants MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json" # FileName for NodeAllocatable Records state NODE_ALLOCATABLE_RECORDS_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/NodeAllocatableRecords.json" + # Emit Stream size for Pod MDM metric + POD_MDM_EMIT_STREAM_BATCH_SIZE = 5000 # each record is 200 bytes, 5k records ~2MB end diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 714d78e07..76b0eed0f 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -115,18 +115,21 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm @ #{Time.now.utc.iso8601}" pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) @log.info 
"in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size} @ #{Time.now.utc.iso8601}" - mdm_pod_inventory_es = Fluent::MultiEventStream.new - pod_inventory_mdm_records.each { |pod_inventory_mdm_record| - mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record - if mdm_pod_inventory_es.count >= 5000 # 5k records of MDM is ~2MB and each record is ~400 bytes + if !pod_inventory_mdm_records.nil? && pod_inventory_mdm_records.length > 0 + startTime = (Time.now.to_f * 1000).to_i + recordCount = pod_inventory_mdm_records.length + while recordCount > 0 + record_array = pod_inventory_mdm_records.take(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE) + time_array = Array.new(records.length) { batchTime } + mdm_pod_inventory_es = Fluent::MultiEventStream.new(time_array, record_array) router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) - mdm_pod_inventory_es = Fluent::MultiEventStream.new + pod_inventory_mdm_records = pod_inventory_mdm_records.drop(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE) + recordCount = pod_inventory_mdm_records.length + time_array = nil end - } if pod_inventory_mdm_records - if mdm_pod_inventory_es.count > 0 - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) + flushTimeMs = (Time.now.to_f * 1000).to_i - startTime + @log.info "in_kube_podmdminventory:parse_and_emit_records:timetaken to flush all Pod MDM records: #{flushTimeMs} @ #{Time.now.utc.iso8601}" end - mdm_pod_inventory_es = nil end rescue => errorStr $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}" diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index fb66ec158..dd60a250b 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -141,6 +141,10 @@ def start end end + def multi_workers_ready? 
+ return true + end + # get the access token only if the time to expiry is less than 5 minutes and get_access_token_backoff has expired def get_access_token if (Time.now > @get_access_token_backoff_expiry) diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index 278632cb0..a7f9c5435 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -129,7 +129,7 @@ def get_pod_inventory_mdm_records(batch_time) controllerNameDimValue: podControllerNameDimValue, podCountMetricValue: value, } - records.push(JSON.parse(record)) + records.push(Yajl::Parser.parse(record)) } #Add pod metric records From a6d04c5815057ae6ca1ce66b475b2d6efd8cb3aa Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 21 Feb 2022 21:18:07 -0800 Subject: [PATCH 38/65] bug fix --- kubernetes/linux/main.sh | 4 ++-- source/plugins/ruby/in_kube_podinventory.rb | 21 +++++++++++-------- .../plugins/ruby/in_kube_podmdminventory.rb | 14 +++++++++---- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 0f113905c..701b63b17 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -166,8 +166,8 @@ setReplicaSetSpecificConfig() { echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}" - echo "fluentd out mdm flush thread count: $FLUENTD_MDM_FLUSH_THREAD_COUNT" >>~/.bashrc - echo "fluentd buffer plugin queue length: $FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc + echo "fluentd buffer plugin queue length: ${FLUENTD_QUEUE_LIMIT_LENGTH}" + echo "fluentd out mdm flush thread count: ${FLUENTD_MDM_FLUSH_THREAD_COUNT}" } #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding diff --git 
a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 455444f85..28d11e29a 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -26,6 +26,7 @@ def initialize require_relative "omslog" require_relative "constants" require_relative "extension_utils" + require_relative "CustomMetricsUtils" # refer tomlparser-agent-config for updating defaults # this configurable via configmap @@ -348,16 +349,18 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if continuationToken.nil? #no more chunks in this batch to be sent, write all mdm pod inventory records to send - begin - if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 - mdmPodRecordsJson = @mdmPodRecords.to_json - @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" - @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" - writeMDMRecords(mdmPodRecordsJson) - @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" + if CustomMetricsUtils.check_custom_metrics_availability + begin + if !@mdmPodRecords.nil? 
&& @mdmPodRecords.length > 0 + mdmPodRecordsJson = @mdmPodRecords.to_json + @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" + @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" + writeMDMRecords(mdmPodRecordsJson) + @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" + end + rescue => err + @log.warn "in_kube_podinventory::parse_and_emit_records: failed to write MDMRecords with an error: #{err} @ #{Time.now.utc.iso8601}" end - rescue => err - @log.warn "in_kube_podinventory::parse_and_emit_records: failed to write MDMRecords with an error: #{err} @ #{Time.now.utc.iso8601}" end end diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 76b0eed0f..8272420c3 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -24,6 +24,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" + require_relative "CustomMetricsUtils" end config_param :run_interval, :time, :default => 60 @@ -37,6 +38,7 @@ def start if @run_interval super $log.info("in_kube_podmdminventory::start @ #{Time.now.utc.iso8601}") + @isCustomMetricsAvailability = CustomMetricsUtils.check_custom_metrics_availability @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -57,9 +59,13 @@ def shutdown def enumerate begin - currentTime = Time.now - batchTime = currentTime.utc.iso8601 - parse_and_emit_records(batchTime) + if !@isCustomMetricsAvailability + $log.warn "in_kube_podmdminventory::enumerate:skipping since custom metrics not available either for this cluster type or the region" + else + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + parse_and_emit_records(batchTime) + end rescue => errorStr $log.warn 
"in_kube_podmdminventory::enumerate:Failed in enumerate: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -120,7 +126,7 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) recordCount = pod_inventory_mdm_records.length while recordCount > 0 record_array = pod_inventory_mdm_records.take(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE) - time_array = Array.new(records.length) { batchTime } + time_array = Array.new(record_array.length) { batchTime } mdm_pod_inventory_es = Fluent::MultiEventStream.new(time_array, record_array) router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) pod_inventory_mdm_records = pod_inventory_mdm_records.drop(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE) From 4c4d2e6b3f00c15f8e7a8e56a89cffec3e71e995 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 22 Feb 2022 17:55:51 -0800 Subject: [PATCH 39/65] use large queue limit for kube perf --- build/linux/installer/conf/kube.conf | 2 +- kubernetes/linux/main.sh | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 1a566ec28..1b68f990e 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -20,7 +20,7 @@ @type file overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + queue_limit_length "#{ENV['FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 701b63b17..fb824c6bb 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -84,6 +84,7 @@ setReplicaSetSpecificConfig() { echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}" export FLUENTD_FLUSH_INTERVAL="20s" export FLUENTD_QUEUE_LIMIT_LENGTH="20" # default + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="20" export FLUENTD_MDM_FLUSH_THREAD_COUNT="5" 
# default case $NUM_OF_FLUENTD_WORKERS in 5) @@ -94,7 +95,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="5s" - export FLUENTD_QUEUE_LIMIT_LENGTH="60" + export FLUENTD_QUEUE_LIMIT_LENGTH="50" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="100" # kube perf is high volume so would need large queue limit to avoid data loss export MONITORING_MAX_EVENT_RATE="100000" # default MDSD EPS is 20K which is not enough for large scale export FLUENTD_MDM_FLUSH_THREAD_COUNT="20" # if the pod mdm inventory running on separate worker ;; @@ -106,7 +108,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="10s" - export FLUENTD_QUEUE_LIMIT_LENGTH="50" + export FLUENTD_QUEUE_LIMIT_LENGTH="40" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="80" # kube perf is high volume so would need large queue limit export MONITORING_MAX_EVENT_RATE="80000" # default MDSD EPS is 20K which is not enough for large scale ;; 3) @@ -117,7 +120,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="15s" - export FLUENTD_QUEUE_LIMIT_LENGTH="40" + export FLUENTD_QUEUE_LIMIT_LENGTH="30" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="60" # kube perf is high volume so would need large queue limit export MONITORING_MAX_EVENT_RATE="60000" # default MDSD EPS is 20K which is not enough for large scale ;; 2) @@ -128,7 +132,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="20s" - export FLUENTD_QUEUE_LIMIT_LENGTH="30" + export FLUENTD_QUEUE_LIMIT_LENGTH="20" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="40" # kube perf is high volume so would need large queue limit export MONITORING_MAX_EVENT_RATE="40000" # 
default MDSD EPS is 20K which is not enough for large scale ;; @@ -141,6 +146,7 @@ setReplicaSetSpecificConfig() { export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="20s" export FLUENTD_QUEUE_LIMIT_LENGTH="20" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="20" ;; esac echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc @@ -150,6 +156,7 @@ setReplicaSetSpecificConfig() { echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_FLUSH_INTERVAL=$FLUENTD_FLUSH_INTERVAL" >>~/.bashrc + echo "export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH=$FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH" >>~/.bashrc echo "export FLUENTD_QUEUE_LIMIT_LENGTH=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc echo "export FLUENTD_MDM_FLUSH_THREAD_COUNT=$FLUENTD_MDM_FLUSH_THREAD_COUNT" >>~/.bashrc @@ -166,7 +173,8 @@ setReplicaSetSpecificConfig() { echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}" - echo "fluentd buffer plugin queue length: ${FLUENTD_QUEUE_LIMIT_LENGTH}" + echo "fluentd kube perf buffer plugin queue length: ${FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH}" + echo "fluentd buffer plugin queue length for all other non kube perf plugin: ${FLUENTD_QUEUE_LIMIT_LENGTH}" echo "fluentd out mdm flush thread count: ${FLUENTD_MDM_FLUSH_THREAD_COUNT}" } From 333cd8081d27495451c5518bd7f6a76a01ce52d0 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 22 Feb 2022 19:00:32 -0800 Subject: [PATCH 40/65] 5k preview rs limits --- kubernetes/omsagent.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 19f5afe92..88ad931b1 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ 
-610,7 +610,7 @@ spec: resources: limits: cpu: 5 - memory: 2Gi + memory: 5Gi requests: cpu: 150m memory: 250Mi From fb57c3c406ebffaa2fe47ce784f4f8b5c612f8fc Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 23 Feb 2022 16:47:53 -0800 Subject: [PATCH 41/65] handle resourceversion empty or 0 scenrio --- source/plugins/ruby/in_kube_nodes.rb | 117 +++++---- source/plugins/ruby/in_kube_perfinventory.rb | 118 +++++---- source/plugins/ruby/in_kube_podinventory.rb | 257 ++++++++++--------- 3 files changed, 264 insertions(+), 228 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index d3077e713..8ee2e5fc2 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -657,68 +657,77 @@ def watch_nodes end end end - begin - $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
- nodesResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - if !nodeItem.nil? && !nodeItem.empty? + if nodesResourceVersion.nil? || nodesResourceVersion.empty? || nodesResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion: #{nodesResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? 
&& + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? + @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem + @nodeItemsCache.delete(key) } - else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? 
- @nodeCacheMutex.synchronize { - @nodeItemsCache.delete(key) - } end + when "ERROR" + nodesResourceVersion = nil + $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + nodesResourceVersion = nil + $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + break end - when "ERROR" - nodesResourceVersion = nil - $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection - # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end rescue => errorStr $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 00f7b02db..b6abbc263 100644 --- 
a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -305,69 +305,77 @@ def watch_pods end end end - begin - $log.info("in_kube_perfinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_perfinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - podsResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.warn("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - podsResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) - if !podItem.nil? && !podItem.empty? + if podsResourceVersion.nil? || podsResourceVersion.empty? 
|| podsResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_perfinventory::watch_pods:received podsResourceVersion: #{podsResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_perfinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_perfinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + podsResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) + if !podItem.nil? 
&& !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? @podCacheMutex.synchronize { - @podItemsCache[key] = podItem + @podItemsCache.delete(key) } - else - $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - @podCacheMutex.synchronize { - @podItemsCache.delete(key) - } end + when "ERROR" + podsResourceVersion = nil + $log.warn("in_kube_perfinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + podsResourceVersion = nil + $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end - when "ERROR" - podsResourceVersion = nil - $log.warn("in_kube_perfinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end + $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end - $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity more than readtimeout value used in the connection + 
# $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity more than readtimeout value used in the connection - # $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - podsResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end rescue => errorStr $log.warn("in_kube_perfinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 28d11e29a..c44943b0e 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -787,81 +787,91 @@ def watch_pods end end end - begin - $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? 
&& !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - podsResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.warn("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - podsResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - currentWindowsNodeNameList = [] - @windowsNodeNameCacheMutex.synchronize { - currentWindowsNodeNameList = @windowsNodeNameListCache.dup - } - isWindowsPodItem = false - nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" - if !nodeName.empty? && - !currentWindowsNodeNameList.nil? && - !currentWindowsNodeNameList.empty? && - currentWindowsNodeNameList.include?(nodeName) - isWindowsPodItem = true + if podsResourceVersion.nil? || podsResourceVersion.empty? 
|| podsResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_podinventory::watch_pods:received podsResourceVersion: #{podsResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + podsResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? 
+ currentWindowsNodeNameList = [] + @windowsNodeNameCacheMutex.synchronize { + currentWindowsNodeNameList = @windowsNodeNameListCache.dup + } + isWindowsPodItem = false + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" end - podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) - if !podItem.nil? && !podItem.empty? + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? @podCacheMutex.synchronize { - @podItemsCache[key] = podItem + @podItemsCache.delete(key) } - else - $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - @podCacheMutex.synchronize { - @podItemsCache.delete(key) - } end + when "ERROR" + podsResourceVersion = nil + $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + # enforce LIST again otherwise cause inconsistency by skipping a potential RV with valid data! 
+ podsResourceVersion = nil + break end - when "ERROR" - podsResourceVersion = nil - $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end + $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end - $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection - # $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - podsResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end rescue => errorStr $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") @@ -915,67 +925,76 @@ def watch_services serviceInventory = nil end end - begin - $log.info("in_kube_podinventory::watch_services:Establishing 
Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - servicesResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.warn("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - servicesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - serviceItem = KubernetesApiClient.getOptimizedItem("services", item) - if !serviceItem.nil? && !serviceItem.empty? + if servicesResourceVersion.nil? 
|| servicesResourceVersion == "" || servicesResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_podinventory::watch_services:received servicesResourceVersion: #{servicesResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + servicesResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? 
&& !key.empty? + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + if !serviceItem.nil? && !serviceItem.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + else + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? @serviceCacheMutex.synchronize { - @serviceItemsCache[key] = serviceItem + @serviceItemsCache.delete(key) } - else - $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? 
- @serviceCacheMutex.synchronize { - @serviceItemsCache.delete(key) - } end + when "ERROR" + servicesResourceVersion = nil + $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + servicesResourceVersion = nil + $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + break end - when "ERROR" - servicesResourceVersion = nil - $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + rescue Net::ReadTimeout => errorStr + # $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - # $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - servicesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end rescue => errorStr $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") From 86f088e99a69af374ebaeab7733458aa6e9a0bbb Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 25 Feb 2022 00:18:27 -0800 Subject: [PATCH 42/65] handle pagination api call failures --- source/plugins/ruby/KubernetesApiClient.rb | 62 +++- source/plugins/ruby/in_kube_nodes.rb | 70 ++-- 
source/plugins/ruby/in_kube_perfinventory.rb | 74 +++-- source/plugins/ruby/in_kube_podinventory.rb | 317 ++++++++++--------- 4 files changed, 318 insertions(+), 205 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 0d4267685..7f8cd0498 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -75,6 +75,39 @@ def getKubeResourceInfo(resource, api_group: nil) return response end + def getKubeResourceInfoV2(resource, api_group: nil) + headers = {} + response = nil + responseCode = nil + @Log.info "Getting Kube resource: #{resource}" + begin + resourceUri = getResourceUri(resource, api_group) + if !resourceUri.nil? + uri = URI.parse(resourceUri) + if !File.exist?(@@CaFile) + raise "#{@@CaFile} doesnt exist" + else + Net::HTTP.start(uri.host, uri.port, :use_ssl => true, :ca_file => @@CaFile, :verify_mode => OpenSSL::SSL::VERIFY_PEER, :open_timeout => 20, :read_timeout => 40) do |http| + kubeApiRequest = Net::HTTP::Get.new(uri.request_uri) + kubeApiRequest["Authorization"] = "Bearer " + getTokenStr + @Log.info "KubernetesAPIClient::getKubeResourceInfoV2 : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" + response = http.request(kubeApiRequest) + responseCode = response.code + @Log.info "KubernetesAPIClient::getKubeResourceInfoV2 : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" + end + end + end + rescue => error + @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") + end + if (!response.nil?) + if (!response.body.nil? && response.body.empty?) + @Log.warn("KubernetesAPIClient::getKubeResourceInfoV2 : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") + end + end + return responseCode, response + end + def getTokenStr return @@TokenStr if !@@TokenStr.nil? 
begin @@ -759,12 +792,37 @@ def getMetricNumericValue(metricName, metricVal) return metricValue end # getMetricNumericValue + def getResourcesAndContinuationTokenV2(uri, api_group: nil) + continuationToken = nil + resourceInventory = nil + responseCode = nil + begin + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2 : Getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" + responseCode, resourceInfo = getKubeResourceInfoV2(uri, api_group: api_group) + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2 : Done getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" + if !responseCode.nil? && responseCode == "200" && !resourceInfo.nil? + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2:Start:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" + resourceInventory = Yajl::Parser.parse(StringIO.new(resourceInfo.body)) + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2:End:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" + resourceInfo = nil + end + if (!resourceInventory.nil? && !resourceInventory["metadata"].nil?) 
+ continuationToken = resourceInventory["metadata"]["continue"] + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getResourcesAndContinuationTokenV2:Failed in get resources for #{uri} and continuation token: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + resourceInventory = nil + end + return continuationToken, resourceInventory, responseCode + end #getResourcesAndContinuationTokenV2 + def getResourcesAndContinuationToken(uri, api_group: nil) continuationToken = nil resourceInventory = nil begin @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" - resourceInfo = getKubeResourceInfo(uri, api_group: api_group) + responseCode, resourceInfo = getKubeResourceInfo(uri, api_group: api_group) @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Done getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" if !resourceInfo.nil? @Log.info "KubernetesApiClient::getResourcesAndContinuationToken:Start:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" @@ -1107,7 +1165,7 @@ def getPodOptimizedItem(resourceItem, isWindowsPodItem) currentContainerStatus["restartCount"] = containerStatus["restartCount"] currentContainerStatus["state"] = containerStatus["state"] currentContainerStatus["lastState"] = containerStatus["lastState"] - if isWindowsPod + if isWindowsPodItem currentContainerStatus["imageID"] = containerStatus["imageID"] end item["status"]["initContainerStatuses"].push(currentContainerStatus) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 8ee2e5fc2..121b1804f 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -603,39 +603,17 @@ def watch_nodes @nodeItemsCache.clear() } continuationToken = nil - $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is 
#{nodesResourceVersion} @ #{Time.now.utc.iso8601}") resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - if !nodeItem.nil? && !nodeItem.empty? - @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem - } - else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end + $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" - end - while (!continuationToken.nil? && !continuationToken.empty?) 
- continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") if (!nodeInventory.nil? && !nodeInventory.empty?) nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? @@ -653,13 +631,45 @@ def watch_nodes end end else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}") + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # break, if any of the pagination call failed so that full cache can be rebuild with LIST again + break + else + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
+ $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? + @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + end end end end if nodesResourceVersion.nil? || nodesResourceVersion.empty? || nodesResourceVersion == "0" # https://github.com/kubernetes/kubernetes/issues/74022 - $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion: #{nodesResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil # for the LIST to happen again sleep(30) # do not overwhelm the api-server if api-server broken else diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index b6abbc263..50552a25d 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -252,38 +252,17 @@ def watch_pods @podItemsCache.clear() } continuationToken = nil - $log.info("in_kube_perfinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") - $log.info("in_kube_perfinventory::watch_pods:Done getting pods 
from Kube API @ #{Time.now.utc.iso8601}") - if (!podInventory.nil? && !podInventory.empty?) - podsResourceVersion = podInventory["metadata"]["resourceVersion"] - if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - podInventory["items"].each do |item| - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) - if !podItem.nil? && !podItem.empty? - @podCacheMutex.synchronize { - @podItemsCache[key] = podItem - } - else - $log.warn "in_kube_perfinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end + resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}" + $log.info("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") else - $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory" - end - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") + $log.info("in_kube_perfinventory::watch_pods:Done getting pods from Kube API:#{resourceUri} @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty?) 
podsResourceVersion = podInventory["metadata"]["resourceVersion"] if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? @@ -293,7 +272,7 @@ def watch_pods @podItemsCache[key] = podItem } else - $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_perfinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" end else $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" @@ -301,7 +280,40 @@ def watch_pods end end else - $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory" + end + while (!continuationToken.nil? && !continuationToken.empty?) + resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}" + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + break # break, if any of the pagination call failed so that full cache will rebuild with LIST again + else + if (!podInventory.nil? && !podInventory.empty?) + podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
+ $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + end + end end end end @@ -364,7 +376,7 @@ def watch_pods $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end - $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods @ #{Time.now.utc.iso8601}") end rescue Net::ReadTimeout => errorStr ## This expected if there is no activity more than readtimeout value used in the connection diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index c44943b0e..68704c4d3 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -718,46 +718,17 @@ def watch_pods currentWindowsNodeNameList = @windowsNodeNameListCache.dup } continuationToken = nil - $log.info("in_kube_podinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - continuationToken, podInventory = 
KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") - $log.info("in_kube_podinventory::watch_pods:Done getting pods from Kube API @ #{Time.now.utc.iso8601}") - if (!podInventory.nil? && !podInventory.empty?) - podsResourceVersion = podInventory["metadata"]["resourceVersion"] - if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - podInventory["items"].each do |item| - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" - isWindowsPodItem = false - if !nodeName.empty? && - !currentWindowsNodeNameList.nil? && - !currentWindowsNodeNameList.empty? && - currentWindowsNodeNameList.include?(nodeName) - isWindowsPodItem = true - end - podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) - if !podItem.nil? && !podItem.empty? - @podCacheMutex.synchronize { - @podItemsCache[key] = podItem - } - else - $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end + resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}" + $log.info("in_kube_podinventory::watch_pods:Getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? 
|| responseCode != "200" + $log.warn("in_kube_podinventory::watch_pods: getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") else - $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory" - end - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") + $log.info("in_kube_podinventory::watch_pods:Done getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty?) podsResourceVersion = podInventory["metadata"]["resourceVersion"] if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? @@ -775,7 +746,7 @@ def watch_pods @podItemsCache[key] = podItem } else - $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" end else $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" @@ -783,15 +754,56 @@ def watch_pods end end else - $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory" + end + while (!continuationToken.nil? && !continuationToken.empty?) 
+ resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}" + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_podinventory::watch_pods: getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + break # break, if any of the pagination call failed so that full cache will rebuild with LIST again + else + if (!podInventory.nil? && !podInventory.empty?) + podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + isWindowsPodItem = false + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + end + end end end end if podsResourceVersion.nil? || podsResourceVersion.empty? 
|| podsResourceVersion == "0" # https://github.com/kubernetes/kubernetes/issues/74022 - $log.warn("in_kube_podinventory::watch_pods:received podsResourceVersion: #{podsResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_pods:received podsResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") podsResourceVersion = nil # for the LIST to happen again - sleep(30) # do not overwhelm the api-server if api-server broken + sleep(30) # do not overwhelm the api-server if api-server down else begin $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") @@ -860,7 +872,7 @@ def watch_pods break end end - $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods @ #{Time.now.utc.iso8601}") end rescue Net::ReadTimeout => errorStr ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection @@ -868,7 +880,7 @@ def watch_pods rescue => errorStr $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") podsResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken + sleep(5) # do not overwhelm the api-server if api-server down ensure watcher.finish if watcher end @@ -892,44 +904,48 @@ def watch_services @serviceItemsCache.clear() } $log.info("in_kube_podinventory::watch_services:Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") - $log.info("in_kube_podinventory::watch_services: Done getting services from Kube API @ #{Time.now.utc.iso8601}") - if !serviceInfo.nil? 
- $log.info("in_kube_podinventory::watch_services:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") - serviceInventory = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) - $log.info("in_kube_podinventory::watch_services:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") - serviceInfo = nil - if (!serviceInventory.nil? && !serviceInventory.empty?) - servicesResourceVersion = serviceInventory["metadata"]["resourceVersion"] - if (serviceInventory.key?("items") && !serviceInventory["items"].nil? && !serviceInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_services:number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}") - serviceInventory["items"].each do |item| - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - serviceItem = KubernetesApiClient.getOptimizedItem("services", item) - if !serviceItem.nil? && !serviceItem.empty? - @serviceCacheMutex.synchronize { - @serviceItemsCache[key] = serviceItem - } + responseCode, serviceInfo = KubernetesApiClient.getKubeResourceInfoV2("services") + if responseCode.nil? || responseCode != "200" + $log.info("in_kube_podinventory::watch_services:Getting services from Kube API failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_services: Done getting services from Kube API @ #{Time.now.utc.iso8601}") + if !serviceInfo.nil? + $log.info("in_kube_podinventory::watch_services:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceInventory = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) + $log.info("in_kube_podinventory::watch_services:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceInfo = nil + if (!serviceInventory.nil? && !serviceInventory.empty?) + servicesResourceVersion = serviceInventory["metadata"]["resourceVersion"] + if (serviceInventory.key?("items") && !serviceInventory["items"].nil? 
&& !serviceInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_services:number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}") + serviceInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + if !serviceItem.nil? && !serviceItem.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + else + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" + end else - $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" end end + else + $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory @ #{Time.now.utc.iso8601}" + serviceInventory = nil end - serviceInventory = nil end end if servicesResourceVersion.nil? 
|| servicesResourceVersion == "" || servicesResourceVersion == "0" # https://github.com/kubernetes/kubernetes/issues/74022 - $log.warn("in_kube_podinventory::watch_services:received servicesResourceVersion: #{servicesResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_services:received servicesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil # for the LIST to happen again - sleep(30) # do not overwhelm the api-server if api-server broken + sleep(30) # do not overwhelm the api-server if api-server down else begin $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") @@ -991,7 +1007,7 @@ def watch_services rescue => errorStr $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken + sleep(5) # do not overwhelm the api-server if api-server down ensure watcher.finish if watcher end @@ -1014,36 +1030,17 @@ def watch_windows_nodes @windowsNodeNameListCache.clear() } continuationToken = nil - $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows&limit=#{@NODES_CHUNK_SIZE}") - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_podinventory::watch_windows_nodes:Done getting windows nodes from Kube API @ #{Time.now.utc.iso8601}") - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? 
&& !nodeInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_windows_nodes: number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| - key = item["metadata"]["name"] - if !key.nil? && !key.empty? - @windowsNodeNameCacheMutex.synchronize { - if !@windowsNodeNameListCache.include?(key) - @windowsNodeNameListCache.push(key) - end - } - else - $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end + $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") else - $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" - end - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + $log.info("in_kube_podinventory::watch_windows_nodes:Done getting windows nodes from Kube API @ #{Time.now.utc.iso8601}") if (!nodeInventory.nil? && !nodeInventory.empty?) nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
- $log.info("in_kube_podinventory::watch_windows_nodes : number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes: number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["name"] if !key.nil? && !key.empty? @@ -1060,61 +1057,97 @@ def watch_windows_nodes else $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}") + if responseCode.nil? || responseCode != "200" + $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + break # break, if any of the pagination call failed so that full cache can be rebuild with LIST again + else + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_windows_nodes : number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? 
+ @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + else + $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + end + end end end - begin - $log.info("in_kube_podinventory::watch_windows_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("nodes", label_selector: "kubernetes.io/os=windows", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_podinventory::watch_windows_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - nodesResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.warn("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + if nodesResourceVersion.nil? || nodesResourceVersion.empty? 
|| nodesResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_podinventory::watch_windows_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server down + else + begin + $log.info("in_kube_podinventory::watch_windows_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", label_selector: "kubernetes.io/os=windows", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_windows_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
+ break + end + if notice["type"] == "ADDED" # we dont need to worry about modified event since we only need node name + key = item["metadata"]["name"] + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + elsif notice["type"] == "DELETED" + key = item["metadata"]["name"] + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache.delete(key) + } + end + when "ERROR" nodesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + $log.warn("in_kube_podinventory::watch_windows_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") break + else + $log.warn("in_kube_podinventory::watch_windows_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end - if notice["type"] == "ADDED" # we dont need to worry about modified event since we only need node name - key = item["metadata"]["name"] - @windowsNodeNameCacheMutex.synchronize { - if !@windowsNodeNameListCache.include?(key) - @windowsNodeNameListCache.push(key) - end - } - elsif notice["type"] == "DELETED" - key = item["metadata"]["name"] - @windowsNodeNameCacheMutex.synchronize { - @windowsNodeNameListCache.delete(key) - } - end - when "ERROR" - nodesResourceVersion = nil - $log.warn("in_kube_podinventory::watch_windows_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_podinventory::watch_windows_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity more than readtimeout value used in the connection + # $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ 
#{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity more than readtimeout value used in the connection - # $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end rescue => errorStr $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") From 351f0ff0192d6abce97ba32d8177199c669053a2 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 25 Feb 2022 00:29:55 -0800 Subject: [PATCH 43/65] fix bug --- source/plugins/ruby/in_kube_podinventory.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 68704c4d3..326c85895 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -758,7 +758,7 @@ def watch_pods end while (!continuationToken.nil? && !continuationToken.empty?) resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}" - continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) if responseCode.nil? 
|| responseCode != "200" $log.warn("in_kube_podinventory::watch_pods: getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") podsResourceVersion = nil From 497bce47095ce8394b5f791176c6cf4f984b20e0 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 1 Mar 2022 16:08:07 -0800 Subject: [PATCH 44/65] preview image for internal customer validation --- charts/azuremonitor-containers/values.yaml | 4 ++-- kubernetes/omsagent.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index e15791d21..9c9a0f195 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -21,11 +21,11 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod01312022" + tag: "ciprodpreview03012022" tagWindows: "win-ciprod01312022" pullPolicy: IfNotPresent dockerProviderVersion: "16.0.0-0" - agentVersion: "azure-mdsd-1..17.0" + agentVersion: "azure-mdsd-1.17.0" winAgentVersion: "0.0.0-0" # there is no base agent version for windows agent # The priority used by the omsagent priority class for the daemonset pods diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 88ad931b1..6e6d44d51 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -357,7 +357,7 @@ spec: component: oms-agent tier: node annotations: - agentVersion: "azure-mdsd-1..17.0" + agentVersion: "azure-mdsd-1.17.0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: @@ -598,7 +598,7 @@ spec: labels: rsName: "omsagent-rs" annotations: - agentVersion: "azure-mdsd-1..17.0" + agentVersion: "azure-mdsd-1.17.0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: From 911be7eb8758d9fac970d04776a0e2ef4f49c603 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 4 Mar 2022 19:58:29 -0800 Subject: [PATCH 45/65] preview 
image --- kubernetes/linux/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index f3a9efd7a..ad94f001e 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,8 @@ ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprod01312022 +# TODO - revert to PROD version when PR gets merge +ARG IMAGE_TAG=ciprodpreview03012022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} From cec11dd4cd15253de61152914cd4da8b605b49f9 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 6 Mar 2022 21:41:07 -0800 Subject: [PATCH 46/65] wip --- .../ServiceGroupRoot/Scripts/pushAgentToAcr.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh index d39cedde0..e1b9df93a 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh +++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh @@ -15,16 +15,16 @@ fi #Make sure that tag being pushed will not overwrite an existing tag in mcr MCR_TAG_RESULT="`wget -qO- https://mcr.microsoft.com/v2/azuremonitor/containerinsights/ciprod/tags/list`" -if [ $? -ne 0 ]; then +if [ $? 
-ne 0 ]; then echo "-e error unable to get list of mcr tags for azuremonitor/containerinsights/ciprod repository" exit 1 fi TAG_EXISTS=$(echo $MCR_TAG_RESULT | jq '.tags | contains(["'"$AGENT_RELEASE$AGENT_IMAGE_TAG_SUFFIX"'"])') -if $TAG_EXISTS; then - echo "-e error ${AGENT_IMAGE_TAG_SUFFIX} already exists in mcr. make sure the image tag is unique" - exit 1 -fi +# if $TAG_EXISTS; then +# echo "-e error ${AGENT_IMAGE_TAG_SUFFIX} already exists in mcr. make sure the image tag is unique" +# exit 1 +# fi if [ -z $AGENT_IMAGE_FULL_PATH ]; then echo "-e error AGENT_IMAGE_FULL_PATH shouldnt be empty. check release variables" @@ -60,7 +60,7 @@ if [ $? -eq 0 ]; then else echo "-e error failed to login to az with managed identity credentials" exit 1 -fi +fi echo "Pushing ${AGENT_IMAGE_FULL_PATH} to ${ACR_NAME}" az acr import --name $ACR_NAME --registry $CDPX_REGISTRY --source official/${CDPX_REPO_NAME}:${CDPX_TAG} --image $AGENT_IMAGE_FULL_PATH From 3d092c81be9b9070e98e5cfbd078918e7eed69ce Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 6 Mar 2022 21:45:25 -0800 Subject: [PATCH 47/65] wip --- .../ServiceGroupRoot/Scripts/pushAgentToAcr.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh index e1b9df93a..c8338c01d 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh +++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh @@ -21,10 +21,10 @@ if [ $? -ne 0 ]; then fi TAG_EXISTS=$(echo $MCR_TAG_RESULT | jq '.tags | contains(["'"$AGENT_RELEASE$AGENT_IMAGE_TAG_SUFFIX"'"])') -# if $TAG_EXISTS; then -# echo "-e error ${AGENT_IMAGE_TAG_SUFFIX} already exists in mcr. make sure the image tag is unique" -# exit 1 -# fi +if $TAG_EXISTS; then + echo "-e error ${AGENT_IMAGE_TAG_SUFFIX} already exists in mcr. 
make sure the image tag is unique" + exit 1 +fi if [ -z $AGENT_IMAGE_FULL_PATH ]; then echo "-e error AGENT_IMAGE_FULL_PATH shouldnt be empty. check release variables" From 933f2a370b4b95acccd32932f19db1cb0d0a549c Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 4 Apr 2022 20:18:34 -0700 Subject: [PATCH 48/65] fix trailing whitespaces --- .../azuremonitor-containers/templates/omsagent-secret.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-secret.yaml b/charts/azuremonitor-containers/templates/omsagent-secret.yaml index 8c245338c..bf4e7eb3b 100644 --- a/charts/azuremonitor-containers/templates/omsagent-secret.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-secret.yaml @@ -15,14 +15,14 @@ data: DOMAIN: {{ .Values.omsagent.domain | b64enc | quote }} {{- $httpsProxyDict := urlParse .Values.Azure.proxySettings.httpsProxy -}} {{- $httpProxyDict := urlParse .Values.Azure.proxySettings.httpProxy -}} - {{- if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) ($httpsProxyDict.userinfo) }} + {{- if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) ($httpsProxyDict.userinfo) }} PROXY: {{ .Values.Azure.proxySettings.httpsProxy | b64enc | quote }} - {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) (empty $httpsProxyDict.userinfo) }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) (empty $httpsProxyDict.userinfo) }} # adding arbitrary creds since omsagent expects arbitrary creds in case of no auth PROXY: {{ urlJoin (dict "scheme" $httpsProxyDict.scheme "userinfo" "admin:secret" "host" $httpsProxyDict.host) | b64enc | quote }} {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) ($httpProxyDict.userinfo) }} PROXY: 
{{ .Values.Azure.proxySettings.httpProxy | b64enc | quote }} - {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) (empty $httpProxyDict.userinfo) }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) (empty $httpProxyDict.userinfo) }} # adding arbitrary creds since omsagent expects arbitrary creds in case of no auth PROXY: {{ urlJoin (dict "scheme" $httpProxyDict.scheme "userinfo" "admin:secret" "host" $httpProxyDict.host) | b64enc | quote }} {{- else if ne .Values.omsagent.proxy "" }} From 3047e7623427c2e065f331e42350c0de8f269c57 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 20 Apr 2022 21:22:36 -0700 Subject: [PATCH 49/65] fix bug --- source/plugins/ruby/KubernetesApiClient.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 7f8cd0498..ffd76bfbd 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -822,7 +822,7 @@ def getResourcesAndContinuationToken(uri, api_group: nil) resourceInventory = nil begin @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" - responseCode, resourceInfo = getKubeResourceInfo(uri, api_group: api_group) + resourceInfo = getKubeResourceInfo(uri, api_group: api_group) @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Done getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" if !resourceInfo.nil? 
@Log.info "KubernetesApiClient::getResourcesAndContinuationToken:Start:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" From e706feeec87c0e4a80b8cbb009bd15bac4d471d7 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 20 Apr 2022 21:49:01 -0700 Subject: [PATCH 50/65] remove unused envvars in yaml --- kubernetes/omsagent.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 96ea0c982..8bbdf9911 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -667,11 +667,9 @@ spec: valueFrom: resourceFieldRef: containerName: omsagent - resource: limits.cpu - # - name: MONITORING_MAX_EVENT_RATE - # value: "50000" # default 20KPS for MDSD, for large cluster validate 50KPS + resource: limits.cpu - name: EMIT_CACHE_TELEMETRY - value: "true" # enable only debug or test purpose and disable for prod + value: "false" # enable only debug or test purpose and disable for prod - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION From 1ac6672753beaaa395ce8c4575377f154cbcd9f1 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 21 Apr 2022 17:23:23 -0700 Subject: [PATCH 51/65] revert minor things --- .../ServiceGroupRoot/Scripts/pushAgentToAcr.sh | 4 ++-- kubernetes/omsagent.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh index de306b50a..25eb43f47 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh +++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh @@ -15,7 +15,7 @@ fi #Make sure that tag being pushed will not overwrite an existing tag in mcr MCR_TAG_RESULT="`wget -qO- https://mcr.microsoft.com/v2/azuremonitor/containerinsights/ciprod/tags/list`" -if [ $? -ne 0 ]; then +if [ $? 
-ne 0 ]; then echo "-e error unable to get list of mcr tags for azuremonitor/containerinsights/ciprod repository" exit 1 fi @@ -67,7 +67,7 @@ if [ $? -eq 0 ]; then else echo "-e error failed to login to az with managed identity credentials" exit 1 -fi +fi echo "Pushing ${AGENT_IMAGE_FULL_PATH} to ${ACR_NAME}" az acr import --name $ACR_NAME --registry $CDPX_REGISTRY --source official/${CDPX_REPO_NAME}:${CDPX_TAG} --image $AGENT_IMAGE_FULL_PATH diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 8bbdf9911..8cbd7412b 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -657,8 +657,8 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 5 - memory: 5Gi + cpu: 1 + memory: 1Gi requests: cpu: 150m memory: 250Mi @@ -667,7 +667,7 @@ spec: valueFrom: resourceFieldRef: containerName: omsagent - resource: limits.cpu + resource: limits.cpu - name: EMIT_CACHE_TELEMETRY value: "false" # enable only debug or test purpose and disable for prod - name: AKS_RESOURCE_ID From 4bb069e5c666bb8588a031e8794a29c8e3f51e90 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 21 Apr 2022 18:17:53 -0700 Subject: [PATCH 52/65] telemetry tags for preview release --- kubernetes/linux/Dockerfile | 2 +- kubernetes/windows/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index becbe1157..3044f0aa2 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,7 @@ ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprod03172022 +ARG IMAGE_TAG=ciprodpreview04222022 
ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 7c514a777..87e7454c0 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod03172022 +ARG IMAGE_TAG=win-ciprodpreview04222022 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement From 1262c8a84b16c7a4bab3a5c3df1aaa8acdf60399 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 28 Apr 2022 19:48:05 -0700 Subject: [PATCH 53/65] revert preview image tags --- kubernetes/linux/Dockerfile | 2 +- kubernetes/windows/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 3044f0aa2..becbe1157 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,7 @@ ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprodpreview04222022 +ARG IMAGE_TAG=ciprod03172022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 87e7454c0..7c514a777 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprodpreview04222022 +ARG IMAGE_TAG=win-ciprod03172022 # Do not split this into multiple RUN! 
# Docker creates a layer for every RUN-Statement From 5a67c0c0ee429de301b6b260812ec18b354d85dc Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 1 May 2022 08:44:15 -0700 Subject: [PATCH 54/65] revert unintended change --- .../ruby/kubernetes_container_inventory.rb | 63 +++++++++---------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index ffe92ec40..82e36c8cc 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -50,7 +50,7 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !atLocation.nil? containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1] end - end + end containerInventoryRecord["ExitCode"] = 0 isContainerTerminated = false isContainerWaiting = false @@ -84,19 +84,19 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa end containerInfoMap = containersInfoMap[containerName] - # image can be in any one of below format in spec - # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image + # image can be in any one of below format in spec + # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image imageValue = containerInfoMap["image"] if !imageValue.nil? && !imageValue.empty? # Find delimiters in image format atLocation = imageValue.index("@") - isDigestSpecified = false + isDigestSpecified = false if !atLocation.nil? # repository/image@digest or repository/image:imagetag@digest, image@digest imageValue = imageValue[0..(atLocation - 1)] # Use Digest from the spec's image in case when the status doesnt get populated i.e. container in pending or image pull back etc. if containerInventoryRecord["ImageId"].nil? 
|| containerInventoryRecord["ImageId"].empty? - containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] + containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] end isDigestSpecified = true end @@ -105,14 +105,14 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !colonLocation.nil? if slashLocation.nil? # image:imagetag - containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] + containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] else # repository/image:imagetag containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] end containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] - else + else if slashLocation.nil? # image containerInventoryRecord["Image"] = imageValue @@ -120,15 +120,15 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa # repo/image containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..-1] - end + end # if no tag specified, k8s assumes latest as imagetag and this is same behavior from docker API and from status. # Ref - https://kubernetes.io/docs/concepts/containers/images/#image-names - if isDigestSpecified == false + if isDigestSpecified == false containerInventoryRecord["ImageTag"] = "latest" end - end + end end - + podName = containerInfoMap["PodName"] namespace = containerInfoMap["Namespace"] # containername in the format what docker sees @@ -199,11 +199,7 @@ def getContainersInfoMap(podItem, isWindows) cmdValue = container["command"] cmdValueString = (cmdValue.nil?) ? 
"" : cmdValue.to_s containerInfoMap["Command"] = cmdValueString - if isWindows - containerInfoMap["EnvironmentVar"] = container["env"] - else - containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) - end + containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) containersInfoMap[containerName] = containerInfoMap end end @@ -216,47 +212,47 @@ def getContainersInfoMap(podItem, isWindows) return containersInfoMap end - def obtainContainerEnvironmentVars(containerId) + def obtainContainerEnvironmentVars(containerId) envValueString = "" begin - isCGroupPidFetchRequired = false + isCGroupPidFetchRequired = false if !@@containerCGroupCache.has_key?(containerId) - isCGroupPidFetchRequired = true + isCGroupPidFetchRequired = true else cGroupPid = @@containerCGroupCache[containerId] - if cGroupPid.nil? || cGroupPid.empty? + if cGroupPid.nil? || cGroupPid.empty? isCGroupPidFetchRequired = true @@containerCGroupCache.delete(containerId) - elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") + elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") isCGroupPidFetchRequired = true - @@containerCGroupCache.delete(containerId) - end + @@containerCGroupCache.delete(containerId) + end end - if isCGroupPidFetchRequired + if isCGroupPidFetchRequired Dir["/hostfs/proc/*/cgroup"].each do |filename| begin if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any? 
# file full path is /hostfs/proc//cgroup - cGroupPid = filename.split("/")[3] - if is_number?(cGroupPid) + cGroupPid = filename.split("/")[3] + if is_number?(cGroupPid) if @@containerCGroupCache.has_key?(containerId) - tempCGroupPid = @@containerCGroupCache[containerId] + tempCGroupPid = @@containerCGroupCache[containerId] if tempCGroupPid.to_i > cGroupPid.to_i @@containerCGroupCache[containerId] = cGroupPid end else @@containerCGroupCache[containerId] = cGroupPid - end + end end end - rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read - end - end + rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read + end + end end cGroupPid = @@containerCGroupCache[containerId] if !cGroupPid.nil? && !cGroupPid.empty? - environFilePath = "/hostfs/proc/#{cGroupPid}/environ" + environFilePath = "/hostfs/proc/#{cGroupPid}/environ" if File.exist?(environFilePath) # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE # Check to see if the environment variable collection is disabled for this container. @@ -269,7 +265,7 @@ def obtainContainerEnvironmentVars(containerId) if !envVars.nil? && !envVars.empty? envVars = envVars.split("\0") envValueString = envVars.to_json - envValueStringLength = envValueString.length + envValueStringLength = envValueString.length if envValueStringLength >= 200000 lastIndex = envValueString.rindex("\",") if !lastIndex.nil? 
@@ -380,7 +376,6 @@ def deleteCGroupCacheEntryForDeletedContainer(containerId) ApplicationInsightsUtility.sendExceptionTelemetry(error) end end - def is_number?(value) true if Integer(value) rescue false end From 3f2e05f24ab342c26d4623255a9bb9e6ea362eb5 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 11 May 2022 10:57:31 -0700 Subject: [PATCH 55/65] fix bug --- .../ruby/kubernetes_container_inventory.rb | 64 ++++++++++--------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index 82e36c8cc..81889b61b 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -50,7 +50,7 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !atLocation.nil? containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1] end - end + end containerInventoryRecord["ExitCode"] = 0 isContainerTerminated = false isContainerWaiting = false @@ -84,19 +84,19 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa end containerInfoMap = containersInfoMap[containerName] - # image can be in any one of below format in spec - # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image + # image can be in any one of below format in spec + # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image imageValue = containerInfoMap["image"] if !imageValue.nil? && !imageValue.empty? # Find delimiters in image format atLocation = imageValue.index("@") - isDigestSpecified = false + isDigestSpecified = false if !atLocation.nil? 
# repository/image@digest or repository/image:imagetag@digest, image@digest imageValue = imageValue[0..(atLocation - 1)] # Use Digest from the spec's image in case when the status doesnt get populated i.e. container in pending or image pull back etc. if containerInventoryRecord["ImageId"].nil? || containerInventoryRecord["ImageId"].empty? - containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] + containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] end isDigestSpecified = true end @@ -105,14 +105,14 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !colonLocation.nil? if slashLocation.nil? # image:imagetag - containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] + containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] else # repository/image:imagetag containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] end containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] - else + else if slashLocation.nil? # image containerInventoryRecord["Image"] = imageValue @@ -120,15 +120,15 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa # repo/image containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..-1] - end + end # if no tag specified, k8s assumes latest as imagetag and this is same behavior from docker API and from status. 
# Ref - https://kubernetes.io/docs/concepts/containers/images/#image-names - if isDigestSpecified == false + if isDigestSpecified == false containerInventoryRecord["ImageTag"] = "latest" end - end + end end - + podName = containerInfoMap["PodName"] namespace = containerInfoMap["Namespace"] # containername in the format what docker sees @@ -199,7 +199,12 @@ def getContainersInfoMap(podItem, isWindows) cmdValue = container["command"] cmdValueString = (cmdValue.nil?) ? "" : cmdValue.to_s containerInfoMap["Command"] = cmdValueString - containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) + if isWindows + # For windows container inventory, we dont need to get envvars from pods response since its already taken care in KPI as part of pod optimized item + containerInfoMap["EnvironmentVar"] = container["env"] + else + containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) + end containersInfoMap[containerName] = containerInfoMap end end @@ -212,47 +217,47 @@ def getContainersInfoMap(podItem, isWindows) return containersInfoMap end - def obtainContainerEnvironmentVars(containerId) + def obtainContainerEnvironmentVars(containerId) envValueString = "" begin - isCGroupPidFetchRequired = false + isCGroupPidFetchRequired = false if !@@containerCGroupCache.has_key?(containerId) - isCGroupPidFetchRequired = true + isCGroupPidFetchRequired = true else cGroupPid = @@containerCGroupCache[containerId] - if cGroupPid.nil? || cGroupPid.empty? + if cGroupPid.nil? || cGroupPid.empty? 
isCGroupPidFetchRequired = true @@containerCGroupCache.delete(containerId) - elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") + elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") isCGroupPidFetchRequired = true - @@containerCGroupCache.delete(containerId) - end + @@containerCGroupCache.delete(containerId) + end end - if isCGroupPidFetchRequired + if isCGroupPidFetchRequired Dir["/hostfs/proc/*/cgroup"].each do |filename| begin if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any? # file full path is /hostfs/proc//cgroup - cGroupPid = filename.split("/")[3] - if is_number?(cGroupPid) + cGroupPid = filename.split("/")[3] + if is_number?(cGroupPid) if @@containerCGroupCache.has_key?(containerId) - tempCGroupPid = @@containerCGroupCache[containerId] + tempCGroupPid = @@containerCGroupCache[containerId] if tempCGroupPid.to_i > cGroupPid.to_i @@containerCGroupCache[containerId] = cGroupPid end else @@containerCGroupCache[containerId] = cGroupPid - end + end end end - rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read - end - end + rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read + end + end end cGroupPid = @@containerCGroupCache[containerId] if !cGroupPid.nil? && !cGroupPid.empty? - environFilePath = "/hostfs/proc/#{cGroupPid}/environ" + environFilePath = "/hostfs/proc/#{cGroupPid}/environ" if File.exist?(environFilePath) # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE # Check to see if the environment variable collection is disabled for this container. @@ -265,7 +270,7 @@ def obtainContainerEnvironmentVars(containerId) if !envVars.nil? && !envVars.empty? 
envVars = envVars.split("\0") envValueString = envVars.to_json - envValueStringLength = envValueString.length + envValueStringLength = envValueString.length if envValueStringLength >= 200000 lastIndex = envValueString.rindex("\",") if !lastIndex.nil? @@ -376,6 +381,7 @@ def deleteCGroupCacheEntryForDeletedContainer(containerId) ApplicationInsightsUtility.sendExceptionTelemetry(error) end end + def is_number?(value) true if Integer(value) rescue false end From 7dad848ae8aae4d21614bcb05fe665f0155b3ad6 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 12 May 2022 23:08:40 -0700 Subject: [PATCH 56/65] use same batchtime for both mdm & podinventory records --- source/plugins/ruby/in_kube_podinventory.rb | 17 ++++++++++------ .../plugins/ruby/in_kube_podmdminventory.rb | 20 +++++++++---------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 326c85895..2fbdb074c 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -148,7 +148,7 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] @podInventoryE2EProcessingLatencyMs = 0 - @mdmPodRecords = [] + @mdmPodRecordItems = [] podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") @@ -211,7 +211,7 @@ def enumerate(podList = nil) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil - @mdmPodRecords = nil + @mdmPodRecordItems = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -351,11 +351,16 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if continuationToken.nil? 
#no more chunks in this batch to be sent, write all mdm pod inventory records to send if CustomMetricsUtils.check_custom_metrics_availability begin - if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 + if !@mdmPodRecordItems.nil? && @mdmPodRecordItems.length > 0 + mdmPodRecords = { + "collectionTime": batchTime, + "items": @mdmPodRecordItems, + } mdmPodRecordsJson = @mdmPodRecords.to_json @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" writeMDMRecords(mdmPodRecordsJson) + mdmPodRecords = nil @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" end rescue => err @@ -647,7 +652,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) records.push(record) end #container status block end - @mdmPodRecords.push(mdmPodRecord.dup) + @mdmPodRecordItems.push(mdmPodRecord.dup) records.each do |record| if !record.nil? @@ -1175,11 +1180,11 @@ def writeMDMRecords(mdmRecordsJson) raise "in_kube_podinventory:writeMDMRecords:Failed to open file for write" end rescue => err - if retryAttemptCount < maxRetryCount + if retryAttemptCount <= maxRetryCount f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? 
- retryAttemptCount = retryAttemptCount + 1 sleep (initialRetryDelaySecs * retryAttemptCount) + retryAttemptCount = retryAttemptCount + 1 retry end $log.warn "in_kube_podinventory:writeMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 8272420c3..40a5c73d6 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -62,9 +62,7 @@ def enumerate if !@isCustomMetricsAvailability $log.warn "in_kube_podmdminventory::enumerate:skipping since custom metrics not available either for this cluster type or the region" else - currentTime = Time.now - batchTime = currentTime.utc.iso8601 - parse_and_emit_records(batchTime) + parse_and_emit_records() end rescue => errorStr $log.warn "in_kube_podmdminventory::enumerate:Failed in enumerate: #{errorStr}" @@ -73,13 +71,15 @@ def enumerate end end - def parse_and_emit_records(batchTime = Time.utc.iso8601) + def parse_and_emit_records() begin $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:getMDMRecords @ #{Time.now.utc.iso8601}" mdmPodRecords = getMDMRecords() - $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" - if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmPodRecords.length > 0 - mdmPodRecords.each do |record| + mdmPodRecordItems = + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" + if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
&& mdmPodRecords["items"].length > 0 + batchTime = mdmPodRecords["collectionTime"] # This is time KubePODinventory plugin collected + mdmPodRecords["items"].each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) containerRecords = record["containerRecords"] @@ -180,7 +180,7 @@ def getMDMRecords() maxRetryCount = 3 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 - mdmRecords = [] + mdmRecords = {} begin f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") if !f.nil? @@ -194,11 +194,11 @@ def getMDMRecords() raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read" end rescue => err - if retryAttemptCount < maxRetryCount + if retryAttemptCount <= maxRetryCount f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? - retryAttemptCount = retryAttemptCount + 1 sleep (initialRetryDelaySecs * retryAttemptCount) + retryAttemptCount = retryAttemptCount + 1 retry end $log.warn "in_kube_podmdminventory:getMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" From f49dffdf4f00bbe12957e29a08d578c896bb02d3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 13 May 2022 10:07:19 -0700 Subject: [PATCH 57/65] use same batchtime for both mdm & podinventory records --- source/plugins/ruby/in_kube_podinventory.rb | 3 ++- source/plugins/ruby/in_kube_podmdminventory.rb | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 2fbdb074c..b84b53d28 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -356,11 +356,12 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc "collectionTime": batchTime, "items": @mdmPodRecordItems, } - mdmPodRecordsJson = 
@mdmPodRecords.to_json + mdmPodRecordsJson = mdmPodRecords.to_json @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" writeMDMRecords(mdmPodRecordsJson) mdmPodRecords = nil + mdmPodRecordsJson = nil @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" end rescue => err diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 40a5c73d6..5f5aff714 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -78,7 +78,7 @@ def parse_and_emit_records() mdmPodRecordItems = $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmPodRecords["items"].length > 0 - batchTime = mdmPodRecords["collectionTime"] # This is time KubePODinventory plugin collected + batchTime = mdmPodRecords["collectionTime"] # This is same batchTime used in KubePODinventory mdmPodRecords["items"].each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) From f4824b297dd96de7b47a31ca250093cd1b014cf3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 14 May 2022 16:39:55 -0700 Subject: [PATCH 58/65] use same batchtime for both mdm & podinventory records --- source/plugins/ruby/in_kube_podmdminventory.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 5f5aff714..a7d8c4765 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb 
@@ -189,7 +189,7 @@ def getMDMRecords() startTime = (Time.now.to_f * 1000).to_i mdmRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) - $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords["items"].length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read" end From ab3b042e4ba22a9e78b0d0631f89717720beb1a8 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 15 May 2022 13:56:15 -0700 Subject: [PATCH 59/65] use same batchtime for both mdm & podinventory records --- source/plugins/ruby/in_kube_podmdminventory.rb | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index a7d8c4765..5be9bc99c 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -40,6 +40,7 @@ def start $log.info("in_kube_podmdminventory::start @ #{Time.now.utc.iso8601}") @isCustomMetricsAvailability = CustomMetricsUtils.check_custom_metrics_availability @finished = false + @prevCollectionTime = nil @condition = ConditionVariable.new @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) @@ -177,7 +178,7 @@ def run_periodic end def getMDMRecords() - maxRetryCount = 3 + maxRetryCount = 5 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 mdmRecords = {} @@ -189,6 +190,10 @@ def getMDMRecords() startTime = (Time.now.to_f * 1000).to_i mdmRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + if mdmRecords.nil? || mdmRecords.empty? || mdmRecords["items"].nil? 
|| mdmRecords["collectionTime"] == @prevCollectionTime + raise "in_kube_podmdminventory:getMDMRecords: either read mdmRecords is nil or empty or stale" + end + @prevCollectionTime = mdmRecords["collectionTime"] $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords["items"].length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read" From e39a120144022c4fcd4dba5f4c5e49ff6a394466 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 20 May 2022 10:52:03 -0700 Subject: [PATCH 60/65] preview image tag with latest ci_dev changes --- kubernetes/linux/Dockerfile.multiarch | 2 +- kubernetes/windows/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/Dockerfile.multiarch b/kubernetes/linux/Dockerfile.multiarch index fd0330d5d..133f40178 100644 --- a/kubernetes/linux/Dockerfile.multiarch +++ b/kubernetes/linux/Dockerfile.multiarch @@ -29,7 +29,7 @@ RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl COPY --from=builder /src/kubernetes/linux/Linux_ULINUX_1.0_*_64_Release/docker-cimprov-*.*.*-*.*.sh $tmpdir/ COPY kubernetes/linux/setup.sh kubernetes/linux/main.sh kubernetes/linux/defaultpromenvvariables kubernetes/linux/defaultpromenvvariables-rs kubernetes/linux/defaultpromenvvariables-sidecar kubernetes/linux/mdsd.xml kubernetes/linux/envmdsd kubernetes/linux/logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprod05192022 +ARG IMAGE_TAG=ciprodpreview05202022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 383652e0e..e74d05e96 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -5,7 +5,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod05192022 +ARG 
IMAGE_TAG=win-ciprodpreview05202022 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement From cae999b7a1e2b1385b53a3aac4d77d6bbf5b7660 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 8 Jun 2022 19:43:34 -0700 Subject: [PATCH 61/65] change back to use prod image in docker files --- kubernetes/linux/Dockerfile.multiarch | 2 +- kubernetes/windows/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/Dockerfile.multiarch b/kubernetes/linux/Dockerfile.multiarch index 133f40178..fd0330d5d 100644 --- a/kubernetes/linux/Dockerfile.multiarch +++ b/kubernetes/linux/Dockerfile.multiarch @@ -29,7 +29,7 @@ RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl COPY --from=builder /src/kubernetes/linux/Linux_ULINUX_1.0_*_64_Release/docker-cimprov-*.*.*-*.*.sh $tmpdir/ COPY kubernetes/linux/setup.sh kubernetes/linux/main.sh kubernetes/linux/defaultpromenvvariables kubernetes/linux/defaultpromenvvariables-rs kubernetes/linux/defaultpromenvvariables-sidecar kubernetes/linux/mdsd.xml kubernetes/linux/envmdsd kubernetes/linux/logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprodpreview05202022 +ARG IMAGE_TAG=ciprod05192022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index e74d05e96..383652e0e 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -5,7 +5,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprodpreview05202022 +ARG IMAGE_TAG=win-ciprod05192022 # Do not split this into multiple RUN! 
# Docker creates a layer for every RUN-Statement From b4e5427d5df5f5632446546405a7e2a3c8565564 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 10 Jun 2022 18:58:05 -0700 Subject: [PATCH 62/65] fix unit test failures --- kubernetes/linux/main.sh | 4 +- source/plugins/ruby/in_kube_nodes.rb | 274 ++++++++++++---------- source/plugins/ruby/in_kube_nodes_test.rb | 118 +++++----- 3 files changed, 210 insertions(+), 186 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c45ef6024..1e00457d9 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -95,8 +95,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="20" export FLUENTD_MDM_FLUSH_THREAD_COUNT="5" # default case $NUM_OF_FLUENTD_WORKERS in - 5) - export NUM_OF_FLUENTD_WORKERS=5 + [5-9]|9[0-9]|100) + export NUM_OF_FLUENTD_WORKERS=5 # Max is 5 core even if the specified limits more than 5 cores export FLUENTD_POD_INVENTORY_WORKER_ID=4 export FLUENTD_NODE_INVENTORY_WORKER_ID=3 export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 8a017243c..690a1ca8c 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -7,11 +7,12 @@ module Fluent::Plugin class Kube_nodeInventory_Input < Input Fluent::Plugin.register_input("kube_nodes", self) - def initialize(kubernetesApiClient = nil, + def initialize(is_unit_test_mode = nil, kubernetesApiClient = nil, applicationInsightsUtility = nil, extensionUtils = nil, env = nil, - telemetry_flush_interval = nil) + telemetry_flush_interval = nil, + node_items_test_cache = nil) super() require "yaml" @@ -30,6 +31,8 @@ def initialize(kubernetesApiClient = nil, @extensionUtils = extensionUtils == nil ? ExtensionUtils : extensionUtils @env = env == nil ? ENV : env @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = telemetry_flush_interval == nil ? 
Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES : telemetry_flush_interval + @is_unit_test_mode = is_unit_test_mode == nil ? false : true + @node_items_test_cache = node_items_test_cache # these defines were previously at class scope Moving them into the constructor so that they can be set by unit tests @@configMapMountPath = "/etc/config/settings/log-data-collection-settings" @@ -65,6 +68,7 @@ def initialize(kubernetesApiClient = nil, @NodeCache = NodeStatsCache.new() @watchNodesThread = nil @nodeItemsCache = {} + @nodeItemsCacheSizeKB = 0 end config_param :run_interval, :time, :default => 60 @@ -153,14 +157,9 @@ def enumerate # Initializing continuation token to nil continuationToken = nil nodeInventory = {} - nodeItemsCacheSizeKB = 0 + @nodeItemsCacheSizeKB = 0 nodeCount = 0 - @nodeCacheMutex.synchronize { - nodeInventory["items"] = @nodeItemsCache.values.clone - if KubernetesApiClient.isEmitCacheTelemetry() - nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024 - end - } + nodeInventory["items"] = getNodeItemsFromCache() nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
@@ -178,7 +177,7 @@ def enumerate @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) telemetryProperties = {} if KubernetesApiClient.isEmitCacheTelemetry() - telemetryProperties["NODE_ITEMS_CACHE_SIZE_KB"] = nodeItemsCacheSizeKB + telemetryProperties["NODE_ITEMS_CACHE_SIZE_KB"] = @nodeItemsCacheSizeKB end ApplicationInsightsUtility.sendMetricTelemetry("NodeCount", nodeCount, telemetryProperties) @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i @@ -596,58 +595,110 @@ def getNodeTelemetryProps(item) end def watch_nodes - $log.info("in_kube_nodes::watch_nodes:Start @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - loop do - begin - if nodesResourceVersion.nil? - # clear cache before filling the cache with list - @nodeCacheMutex.synchronize { - @nodeItemsCache.clear() - } - continuationToken = nil - resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") - $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") - continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) - if responseCode.nil? || responseCode != "200" - $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? 
- nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - if !nodeItem.nil? && !nodeItem.empty? - @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem - } + if !@is_unit_test_mode + $log.info("in_kube_nodes::watch_nodes:Start @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + loop do + begin + if nodesResourceVersion.nil? + # clear cache before filling the cache with list + @nodeCacheMutex.synchronize { + @nodeItemsCache.clear() + } + continuationToken = nil + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") + $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? 
+ @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}") + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # break, if any of the pagination call failed so that full cache can be rebuild with LIST again + break + else + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? 
+ @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + end + end end else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end end end - else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}") - if responseCode.nil? || responseCode != "200" - $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil # break, if any of the pagination call failed so that full cache can be rebuild with LIST again - break + end + if nodesResourceVersion.nil? || nodesResourceVersion.empty? || nodesResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? 
+ $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") else - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) key = item["metadata"]["uid"] if !key.nil? && !key.empty? 
nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) @@ -661,93 +712,43 @@ def watch_nodes else $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" end - end - end - else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" - end - end - end - end - end - if nodesResourceVersion.nil? || nodesResourceVersion.empty? || nodesResourceVersion == "0" - # https://github.com/kubernetes/kubernetes/issues/74022 - $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil # for the LIST to happen again - sleep(30) # do not overwhelm the api-server if api-server broken - else - begin - $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
- nodesResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - if !nodeItem.nil? && !nodeItem.empty? + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem + @nodeItemsCache.delete(key) } - else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? 
- @nodeCacheMutex.synchronize { - @nodeItemsCache.delete(key) - } end + when "ERROR" + nodesResourceVersion = nil + $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + nodesResourceVersion = nil + $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + break end - when "ERROR" - nodesResourceVersion = nil - $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - nodesResourceVersion = nil - $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") - break end end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection - # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end + rescue => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil end - rescue => errorStr - $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ 
#{Time.now.utc.iso8601}") - nodesResourceVersion = nil end + $log.info("in_kube_nodes::watch_nodes:End @ #{Time.now.utc.iso8601}") end - $log.info("in_kube_nodes::watch_nodes:End @ #{Time.now.utc.iso8601}") end def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) @@ -782,6 +783,21 @@ def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) f.close if !f.nil? end end + + def getNodeItemsFromCache() + nodeItems = {} + if @is_unit_test_mode + nodeItems = @node_items_test_cache + else + @nodeCacheMutex.synchronize { + nodeItems = @nodeItemsCache.values.clone + if KubernetesApiClient.isEmitCacheTelemetry() + @nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024 + end + } + end + return nodeItems + end end # Kube_Node_Input class NodeStatsCache diff --git a/source/plugins/ruby/in_kube_nodes_test.rb b/source/plugins/ruby/in_kube_nodes_test.rb index 8f4984c6c..7d55ea32d 100644 --- a/source/plugins/ruby/in_kube_nodes_test.rb +++ b/source/plugins/ruby/in_kube_nodes_test.rb @@ -1,10 +1,10 @@ -require 'minitest/autorun' +require "minitest/autorun" -require 'fluent/test' -require 'fluent/test/driver/input' -require 'fluent/test/helpers' +require "fluent/test" +require "fluent/test/driver/input" +require "fluent/test/helpers" -require_relative 'in_kube_nodes.rb' +require_relative "in_kube_nodes.rb" class InKubeNodesTests < Minitest::Test include Fluent::Test::Helpers @@ -13,20 +13,22 @@ def setup Fluent::Test.setup end - def create_driver(conf = {}, kubernetesApiClient=nil, applicationInsightsUtility=nil, extensionUtils=nil, env=nil, telemetry_flush_interval=nil) - Fluent::Test::Driver::Input.new(Fluent::Plugin::Kube_nodeInventory_Input.new(kubernetesApiClient=kubernetesApiClient, - applicationInsightsUtility=applicationInsightsUtility, - extensionUtils=extensionUtils, - env=env)).configure(conf) + def create_driver(conf = {}, is_unit_test_mode = true, kubernetesApiClient = nil, applicationInsightsUtility = nil, extensionUtils = nil, env = nil, 
telemetry_flush_interval = nil, node_items_test_cache) + Fluent::Test::Driver::Input.new(Fluent::Plugin::Kube_nodeInventory_Input.new(is_unit_test_mode, kubernetesApiClient = kubernetesApiClient, + applicationInsightsUtility = applicationInsightsUtility, + extensionUtils = extensionUtils, + env = env, + telemetry_flush_interval, + node_items_test_cache)).configure(conf) end # Collection time of scrapped data will always be different. Overwrite it in any records returned by in_kube_ndes.rb def overwrite_collection_time(data) if data.key?("CollectionTime") - data["CollectionTime"] = "~CollectionTime~" + data["CollectionTime"] = "~CollectionTime~" end if data.key?("Timestamp") - data["Timestamp"] = "~Timestamp~" + data["Timestamp"] = "~Timestamp~" end return data end @@ -45,41 +47,46 @@ def test_basic_single_node # isAADMSIAuthMode() is called multiple times and we don't really care how many time it is called. This is the same as mocking # but it doesn't track how many times isAADMSIAuthMode is called def extensionUtils.isAADMSIAuthMode - false + false end nodes_api_response = eval(File.open("test/unit-tests/canned-api-responses/kube-nodes.txt").read) - kubeApiClient.expect(:getResourcesAndContinuationToken, [nil, nodes_api_response], ["nodes?limit=200"]) + node_items_test_cache = nodes_api_response["items"] + kubeApiClient.expect(:getClusterName, "/cluster-name") kubeApiClient.expect(:getClusterId, "/cluster-id") + def appInsightsUtil.sendExceptionTelemetry(exception) + if exception.to_s != "undefined method `[]' for nil:NilClass" + raise "an unexpected exception has occured" + end + end config = "run_interval 999999999" # only run once - d = create_driver(config, kubernetesApiClient=kubeApiClient, applicationInsightsUtility=appInsightsUtil, extensionUtils=extensionUtils, env=env) + d = create_driver(config, true, kubernetesApiClient = kubeApiClient, applicationInsightsUtility = appInsightsUtil, extensionUtils = extensionUtils, env = env, node_items_test_cache) 
d.instance.start d.instance.enumerate d.run(timeout: 99999) # Input plugins decide when to run, so we have to give it enough time to run - - expected_responses = { ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"aks-nodepool1-24816391-vmss000000", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"})] => true, - ["mdm.kubenodeinventory", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", 
"failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"aks-nodepool1-24816391-vmss000000", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"})] => true, - ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"})] => true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1900000000.0}]"})] => true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":4787511296.0}]"})] => 
true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000000.0}]"})] => true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":7291510784.0}]"})] => true} + expected_responses = { ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", overwrite_collection_time({ "CollectionTime" => "2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "aks-nodepool1-24816391-vmss000000", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", 
"topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" })] => true, + ["mdm.kubenodeinventory", overwrite_collection_time({ "CollectionTime" => "2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "aks-nodepool1-24816391-vmss000000", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" })] => true, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", overwrite_collection_time({ "CollectionTime" => "2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" })] => true, + 
["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1900000000.0}]" })] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":4787511296.0}]" })] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000000.0}]" })] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":7291510784.0}]" })] => true } d.events.each do |tag, time, record| - cleaned_record = overwrite_collection_time record - if expected_responses.key?([tag, cleaned_record]) - expected_responses[[tag, cleaned_record]] = true - else - assert(false, "got unexpected record") - end + cleaned_record = overwrite_collection_time record + if expected_responses.key?([tag, cleaned_record]) + 
expected_responses[[tag, cleaned_record]] = true + else + assert(false, "got unexpected record: #{cleaned_record}") + end end expected_responses.each do |key, val| - assert(val, "expected record not emitted: #{key}") + assert(val, "expected record not emitted: #{key}") end # make sure all mocked methods were called the expected number of times @@ -104,7 +111,7 @@ def test_malformed_node_spec # isAADMSIAuthMode() is called multiple times and we don't really care how many time it is called. This is the same as mocking # but it doesn't track how many times isAADMSIAuthMode is called def extensionUtils.isAADMSIAuthMode - false + false end # Set up the KubernetesApiClient Mock. Note: most of the functions in KubernetesApiClient are pure (access no @@ -112,16 +119,17 @@ def extensionUtils.isAADMSIAuthMode # more brittle). Instead, in_kube_nodes bypasses the mock and directly calls these functions in KubernetesApiClient. # Ideally the pure functions in KubernetesApiClient would be refactored into their own file to reduce confusion. nodes_api_response = eval(File.open("test/unit-tests/canned-api-responses/kube-nodes-malformed.txt").read) - kubeApiClient.expect(:getResourcesAndContinuationToken, [nil, nodes_api_response], ["nodes?limit=200"]) + node_items_test_cache = nodes_api_response["items"] + kubeApiClient.expect(:getClusterName, "/cluster-name") kubeApiClient.expect(:getClusterName, "/cluster-name") kubeApiClient.expect(:getClusterId, "/cluster-id") kubeApiClient.expect(:getClusterId, "/cluster-id") def appInsightsUtil.sendExceptionTelemetry(exception) - if exception.to_s != "undefined method `[]' for nil:NilClass" - raise "an unexpected exception has occured" - end + if exception.to_s != "undefined method `[]' for nil:NilClass" + raise "an unexpected exception has occured" + end end # This test doesn't care if metric telemetry is sent properly. 
Looking for an unnecessary value would make it needlessly rigid @@ -130,38 +138,38 @@ def appInsightsUtil.sendMetricTelemetry(a, b, c) config = "run_interval 999999999" # only run once - d = create_driver(config, kubernetesApiClient=kubeApiClient, applicationInsightsUtility=appInsightsUtil, extensionUtils=extensionUtils, env=env, telemetry_flush_interval=0) + d = create_driver(config, true, kubernetesApiClient = kubeApiClient, applicationInsightsUtility = appInsightsUtil, extensionUtils = extensionUtils, env = env, telemetry_flush_interval = 0, node_items_test_cache) d.instance.start d.instance.enumerate d.run(timeout: 99999) #TODO: is this necessary? expected_responses = { - ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"correct-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - 
["mdm.kubenodeinventory", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"correct-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1000000.0}]"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", 
"json_Collections"=>"[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":444.0}]"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000.0}]"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":555.0}]"}] => false, - - # these records are for the malformed node (it doesn't have limits or requests set so there are no PERF records) - ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"malformed-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", 
"LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - ["mdm.kubenodeinventory", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"malformed-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"}] => false + ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => 
"Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "correct-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["mdm.kubenodeinventory", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "correct-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", 
"node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1000000.0}]" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":444.0}]" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000.0}]" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":555.0}]" }] => false, + + # these records are for the malformed node (it doesn't have limits or requests set so there are no PERF records) + 
["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "malformed-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["mdm.kubenodeinventory", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", 
"kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "malformed-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" }] => false, } d.events.each do |tag, time, record| - cleaned_record = overwrite_collection_time record - if expected_responses.key?([tag, cleaned_record]) - expected_responses[[tag, cleaned_record]] = true - end - # don't do anything if an unexpected record was emitted. Since the node spec is malformed, there will be some partial data. - # we care more that the non-malformed data is still emitted + cleaned_record = overwrite_collection_time record + if expected_responses.key?([tag, cleaned_record]) + expected_responses[[tag, cleaned_record]] = true + end + # don't do anything if an unexpected record was emitted. Since the node spec is malformed, there will be some partial data. 
+ # we care more that the non-malformed data is still emitted end expected_responses.each do |key, val| - assert(val, "expected record not emitted: #{key}") + assert(val, "expected record not emitted: #{key}") end kubeApiClient.verify From 81eec6ea39d2306a6fb26fedc53f21112bb3ef9b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 11 Jun 2022 09:56:25 -0700 Subject: [PATCH 63/65] exclude unfixed cve until this get fixed --- .trivyignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.trivyignore b/.trivyignore index f8c029116..56ac504d5 100644 --- a/.trivyignore +++ b/.trivyignore @@ -16,4 +16,4 @@ CVE-2021-31799 CVE-2021-28965 #dpkg vulnerability in ubuntu -CVE-2022-1664 \ No newline at end of file +CVE-2022-1304 \ No newline at end of file From 1a3fa0ea200332f8cb6f91c6fb68ea4afd3ef66d Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 13 Jun 2022 12:24:50 -0700 Subject: [PATCH 64/65] fix minor issue --- source/plugins/ruby/in_kube_podinventory.rb | 2 +- source/plugins/ruby/in_kube_podmdminventory.rb | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index b84b53d28..bdbc465ec 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1164,7 +1164,7 @@ def watch_windows_nodes end def writeMDMRecords(mdmRecordsJson) - maxRetryCount = 3 + maxRetryCount = 5 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 begin diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 5be9bc99c..bfc5227f3 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -76,8 +76,7 @@ def parse_and_emit_records() begin $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:getMDMRecords @ #{Time.now.utc.iso8601}" mdmPodRecords = getMDMRecords() - mdmPodRecordItems = - 
$log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmPodRecords["items"].length > 0 batchTime = mdmPodRecords["collectionTime"] # This is same batchTime used in KubePODinventory mdmPodRecords["items"].each do |record| From 7f3372a96dbbd855e1d5db10273d2c3b27d47d72 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 13 Jun 2022 23:21:23 -0700 Subject: [PATCH 65/65] increase retries to handle transient errors --- source/plugins/ruby/in_kube_nodes.rb | 2 +- source/plugins/ruby/in_kube_perfinventory.rb | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 690a1ca8c..a3cbb5a85 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -752,7 +752,7 @@ def watch_nodes end def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) - maxRetryCount = 3 + maxRetryCount = 5 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 begin diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 50552a25d..ad8fdbf21 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -103,7 +103,6 @@ def enumerate(podList = nil) end nodeAllocatableRecords = getNodeAllocatableRecords() - $log.info("in_kube_perfinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") # Initializing continuation token to nil continuationToken = nil podItemsCacheSizeKB = 0 @@ -398,7 +397,7 @@ def watch_pods end def getNodeAllocatableRecords() - maxRetryCount = 3 + maxRetryCount = 5 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 nodeAllocatableRecords = {} @@ -418,8 +417,8 @@ def 
getNodeAllocatableRecords() if retryAttemptCount < maxRetryCount f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? - retryAttemptCount = retryAttemptCount + 1 sleep (initialRetryDelaySecs * retryAttemptCount) + retryAttemptCount = retryAttemptCount + 1 retry end $log.warn "in_kube_perfinventory:getNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}"