From 95f5b6522855643152678e0cb0bd4207f6291ca9 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 3 Jan 2022 11:11:40 -0800 Subject: [PATCH 01/65] watch and multiproc implementation --- build/linux/installer/conf/kube.conf | 396 ++++++++++-------- .../installer/datafiles/base_container.data | 3 +- kubernetes/omsagent.yaml | 5 +- source/plugins/ruby/KubernetesApiClient.rb | 385 ++++++++++++++++- source/plugins/ruby/WatchStream.rb | 63 +++ source/plugins/ruby/in_kube_nodes.rb | 288 ++++++++----- source/plugins/ruby/in_kube_podinventory.rb | 305 +++++++++++--- .../ruby/kubernetes_container_inventory.rb | 63 +-- 8 files changed, 1121 insertions(+), 387 deletions(-) create mode 100644 source/plugins/ruby/WatchStream.rb diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index a1c8bf928..1340a27a4 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -1,92 +1,78 @@ - #fluent forward plugin - - @type forward - port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" - bind 0.0.0.0 - chunk_size_limit 4m - + #fluent forward plugin + + workers 2 + root_dir /var/opt/microsoft/docker-cimprov/state + - #Kubernetes pod inventory - - @type kube_podinventory - tag oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB - run_interval 60 - @log_level debug - + #perf + + @type forward + @id out_perf_fwd + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + - #Kubernetes Persistent Volume inventory - - @type kube_pvinventory - tag oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB - run_interval 60 - @log_level debug - - - #Kubernetes events - - @type kube_events - tag 
oneagent.containerInsights.KUBE_EVENTS_BLOB - run_interval 60 - @log_level debug - - - #Kubernetes Nodes - - @type kube_nodes - tag oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB - run_interval 60 - @log_level debug - - - #Kubernetes health - - @type kube_health - tag kubehealth.ReplicaSet - run_interval 60 - @log_level debug - + #custom_metrics_mdm filter plugin for perf data from windows nodes + + @type cadvisor2mdm + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes + @log_level info + - #cadvisor perf- Windows nodes - - @type win_cadvisor_perf - tag oneagent.containerInsights.LINUX_PERF_BLOB - run_interval 60 - @log_level debug - + #containerinventory for windows containers + + @type forward + @id out_ci_fwd + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + - #Kubernetes object state - deployments - - @type kubestate_deployments - tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB - run_interval 60 - @log_level debug - - #Kubernetes object state - HPA + + #Kubernetes pod inventory - @type kubestate_hpa - tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB + @type kube_podinventory + tag oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB run_interval 60 @log_level debug - - @type inventory2mdm - @log_level info - - - #custom_metrics_mdm filter plugin for perf data from windows nodes - - @type cadvisor2mdm - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes - @log_level info - - - #health model aggregation filter - - @type health_model_builder - - #kubepodinventory @type forward @@ -108,13 +94,13 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - 
keepalive true + keepalive true - #kubepvinventory - + #kubeservices + @type forward @log_level debug send_timeout 30 @@ -126,7 +112,7 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -134,26 +120,18 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 2 - keepalive true - + keepalive true + - #InsightsMetrics - #kubestate - - @type forward + + @type mdm + @id out_mdm_podinventory @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - @type file - path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer + path /var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -161,13 +139,21 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - keepalive true + retry_mdm_post_wait_minutes 30 - #kubeevents - + #Kubernetes Nodes + + @type kube_nodes + tag oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB + run_interval 60 + @log_level debug + + + #containernodeinventory + @type forward @log_level debug send_timeout 30 @@ -179,7 +165,7 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer + path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -187,13 +173,18 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 3 keepalive true - - #kubeservices - + + + @type inventory2mdm + @log_level info + + + #kubenodeinventory + @type forward @log_level debug send_timeout 30 @@ -205,7 +196,7 @@ @type file - path 
/var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -213,25 +204,18 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 2 + flush_thread_count 5 - keepalive true - + keepalive true + - #kubenodeinventory - - @type forward + + @type mdm + @id out_mdm_nodeinventory @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - @type file - path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer + path /var/opt/microsoft/docker-cimprov/state/out_mdm_nodeinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -239,13 +223,76 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - keepalive true + retry_mdm_post_wait_minutes 30 + - #containernodeinventory - + + #fluent forward plugin + + @type forward + port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" + bind 0.0.0.0 + chunk_size_limit 4m + + + #Kubernetes Persistent Volume inventory + + @type kube_pvinventory + tag oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB + run_interval 60 + @log_level debug + + + #Kubernetes events + + @type kube_events + tag oneagent.containerInsights.KUBE_EVENTS_BLOB + run_interval 60 + @log_level debug + + + #Kubernetes health + + @type kube_health + tag kubehealth.ReplicaSet + run_interval 60 + @log_level debug + + + #cadvisor perf- Windows nodes + + @type win_cadvisor_perf + tag oneagent.containerInsights.LINUX_PERF_BLOB + run_interval 60 + @log_level debug + + + #Kubernetes object state - deployments + + @type kubestate_deployments + tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB + run_interval 60 + @log_level debug + + + #Kubernetes object state - HPA + + @type kubestate_hpa + tag 
oneagent.containerInsights.INSIGHTS_METRICS_BLOB + run_interval 60 + @log_level debug + + + #health model aggregation filter + + @type health_model_builder + + + #kubepvinventory + @type forward @log_level debug send_timeout 30 @@ -257,7 +304,7 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -265,25 +312,26 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 3 + flush_thread_count 5 - keepalive true + keepalive true - #containerinventory for windows containers - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - + #InsightsMetrics + #kubestate + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + @type file - path /var/opt/microsoft/docker-cimprov/state/containerinventory*.buffer + path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -291,13 +339,13 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 keepalive true - + - #perf - + #kubeevents + @type forward @log_level debug send_timeout 30 @@ -309,7 +357,7 @@ @type file - path /var/opt/microsoft/docker-cimprov/state/perf*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -317,17 +365,25 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - keepalive true + keepalive true - - @type mdm - @log_level debug + #kubehealth + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + 
heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + @type file - path /var/opt/microsoft/docker-cimprov/state/out_mdm_*.buffer + path /var/opt/microsoft/docker-cimprov/state/kubehealth*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 @@ -335,13 +391,14 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 - retry_mdm_post_wait_minutes 30 + keepalive true @type mdm + @id out_mdm_perf @log_level debug @type file @@ -353,33 +410,8 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 retry_mdm_post_wait_minutes 30 - - #kubehealth - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - - - @type file - path /var/opt/microsoft/docker-cimprov/state/kubehealth*.buffer - overflow_action drop_oldest_chunk - chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s - retry_max_times 10 - retry_wait 5s - retry_max_interval 5m - flush_thread_count 5 - - keepalive true - + diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index d104a5084..a405e760f 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -178,6 +178,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/fluent/plugin/out_health_forward.rb; source/plugins/ruby/out_health_forward.rb; 644; root; root /etc/fluent/plugin/out_mdm.rb; source/plugins/ruby/out_mdm.rb; 644; root; root +/etc/fluent/plugin/WatchStream.rb; source/plugins/ruby/WatchStream.rb; 644; root; root @@ -309,7 +310,7 @@ if ${{PERFORMING_UPGRADE_NOT}}; then rmdir /etc/opt/microsoft/docker-cimprov/conf 2> /dev/null rmdir /etc/opt/microsoft/docker-cimprov 2> /dev/null rmdir /etc/opt/microsoft 2> /dev/null - rmdir /etc/opt 2> /dev/null + rmdir /etc/opt 2> 
/dev/null fi %Preinstall_0 diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index a1a843196..2ff9c5249 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -26,7 +26,7 @@ rules: verbs: ["list", "get", "watch"] - apiGroups: ["apps", "extensions", "autoscaling"] resources: ["replicasets", "deployments", "horizontalpodautoscalers"] - verbs: ["list"] + verbs: ["list", "watch"] - apiGroups: ["azmon.container.insights"] resources: ["healthstates"] verbs: ["get", "create", "patch"] @@ -607,7 +607,7 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 1 + cpu: 2 memory: 1Gi requests: cpu: 150m @@ -927,4 +927,3 @@ spec: names: plural: healthstates kind: HealthState - \ No newline at end of file diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 8925248d7..319129cae 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -11,6 +11,8 @@ class KubernetesApiClient require_relative "oms_common" require_relative "constants" + require_relative "WatchStream" + require_relative "kubernetes_container_inventory" @@ApiVersion = "v1" @@ApiVersionApps = "v1" @@ -88,7 +90,7 @@ def getTokenStr end end - def getClusterRegion(env=ENV) + def getClusterRegion(env = ENV) if env["AKS_REGION"] return env["AKS_REGION"] else @@ -97,7 +99,7 @@ def getClusterRegion(env=ENV) end end - def getResourceUri(resource, api_group, env=ENV) + def getResourceUri(resource, api_group, env = ENV) begin if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"] if api_group.nil? @@ -114,7 +116,7 @@ def getResourceUri(resource, api_group, env=ENV) end end - def getClusterName(env=ENV) + def getClusterName(env = ENV) return @@ClusterName if !@@ClusterName.nil? 
@@ClusterName = "None" begin @@ -148,7 +150,7 @@ def getClusterName(env=ENV) return @@ClusterName end - def getClusterId(env=ENV) + def getClusterId(env = ENV) return @@ClusterId if !@@ClusterId.nil? #By default initialize ClusterId to ClusterName. # In ACS/On-prem, we need to figure out how we can generate ClusterId @@ -456,19 +458,19 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricCollection = {} metricCollection["CounterName"] = metricNametoReturn metricCollection["Value"] = metricValue - + metricProps["json_Collections"] = [] - metricCollections = [] - metricCollections.push(metricCollection) + metricCollections = [] + metricCollections.push(metricCollection) metricProps["json_Collections"] = metricCollections.to_json - metricItems.push(metricProps) + metricItems.push(metricProps) #No container level limit for the given metric, so default to node level limit else nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) metricValue = @@NodeMetrics[nodeMetricsHashKey] #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") - + metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName @@ -481,10 +483,10 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricCollection["CounterName"] = metricNametoReturn metricCollection["Value"] = metricValue metricProps["json_Collections"] = [] - metricCollections = [] - metricCollections.push(metricCollection) + metricCollections = [] + metricCollections.push(metricCollection) metricProps["json_Collections"] = metricCollections.to_json - metricItems.push(metricProps) + metricItems.push(metricProps) end end end @@ -615,11 +617,11 @@ def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metri 
metricCollection["CounterName"] = metricNametoReturn metricCollection["Value"] = metricValue metricCollections = [] - metricCollections.push(metricCollection) - + metricCollections.push(metricCollection) + metricItem["json_Collections"] = [] metricItem["json_Collections"] = metricCollections.to_json - + #push node level metrics to a inmem hash so that we can use it looking up at container level. #Currently if container level cpu & memory limits are not defined we default to node level limits @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue @@ -778,7 +780,7 @@ def getResourcesAndContinuationToken(uri, api_group: nil) return continuationToken, resourceInventory end #getResourcesAndContinuationToken - def getKubeAPIServerUrl(env=ENV) + def getKubeAPIServerUrl(env = ENV) apiServerUrl = nil begin if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"] @@ -818,5 +820,356 @@ def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) end return kubeServiceRecords end + + # Accepts the following options: + # :namespace (string) - the namespace of the entity. + # :name (string) - the name of the entity to watch. + # :label_selector (string) - a selector to restrict the list of returned objects by labels. + # :field_selector (string) - a selector to restrict the list of returned objects by fields. + # :resource_version (string) - shows changes that occur after passed version of a resource. + # :allow_watch_bookmarks (bool) - flag to indicate whether to use bookmark or not. 
+ def watch(resource_name, options = {}) + begin + if !File.exist?(@@CaFile) + raise "#{@@CaFile} doesnt exist" + end + http_options = { + use_ssl: true, + open_timeout: 60, + read_timeout: 240, # https://github.com/kubernetes-client/java/issues/1370 https://github.com/kubernetes-client/java/issues/1578 + ca_file: @@CaFile, + verify_mode: OpenSSL::SSL::VERIFY_PEER, + } + http_headers = { + Authorization: "Bearer " + getTokenStr, + } + ns = "" + if !options[:namespace].to_s.empty? + ns = "namespaces/#{namespace}/" + end + path = "watch/#{ns}#{resource_name}" + path += "/#{options[:name]}" if options[:name] + api_endpoint = "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + "#{path}" + uri = URI.parse(api_endpoint) + params = {} + WATCH_ARGUMENTS.each { |k, v| params[k] = options[v] if options[v] } + uri.query = URI.encode_www_form(params) if params.any? + watcher = WatchStream.new( + uri, + http_options, + http_headers, + @Log + ) + return watcher unless block_given? + begin + watcher.each(&block) + ensure + watcher.finish if watcher + end + rescue => errorStr + @Log.warn "KubernetesApiClient::watch:Failed with an error : #{errorStr}" + end + end + + def getOptimizedItem(resource, resourceItem, winNodes = []) + case resource + when "pods" + return getPodOptimizedItem(resourceItem, winNodes) + when "nodes" + return getNodeOptimizedItem(resourceItem) + when "services" + return getServiceOptimizedItem(resourceItem) + when "deployments" + return getDeploymentOptimizedItem(resourceItem) + when "horizontalpodautoscalers" + return getHpaOptimizedItem(resourceItem) + else + return resourceItem + end + end + + def getServiceOptimizedItem(resourceItem) + item = {} + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + end + item["spec"] = {} + if !resourceItem["spec"].nil? 
+ item["spec"]["selector"] = [] + if !resourceItem["spec"]["selector"].nil? + item["spec"]["selector"] = resourceItem["spec"]["selector"] + end + item["spec"]["clusterIP"] = "" + if !resourceItem["spec"]["clusterIP"].nil? + item["spec"]["clusterIP"] = resourceItem["spec"]["clusterIP"] + end + item["spec"]["type"] = "" + if !resourceItem["spec"]["type"].nil? + item["spec"]["type"] = resourceItem["spec"]["type"] + end + end + return item + end + + def isWindowsPodItem(podItem, winNodes) + isWindowsPod = false + if !winNodes.nil? && !winNodes.empty? + nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? podItem["spec"]["nodeName"] : "" + if !nodeName.empty? && winNodes.include?(nodeName) + isWindowsPod = true + end + end + return isWindowsPod + end + + def getPodOptimizedItem(resourceItem, winNodes) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + if !resourceItem["metadata"]["annotations"].nil? + item["metadata"]["annotations"] = resourceItem["metadata"]["annotations"] + end + if !resourceItem["metadata"]["labels"].nil? + item["metadata"]["labels"] = resourceItem["metadata"]["labels"] + end + if !resourceItem["metadata"]["ownerReferences"].nil? + item["metadata"]["ownerReferences"] = resourceItem["metadata"]["ownerReferences"] + end + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + item["metadata"]["resourceVersion"] = resourceItem["metadata"]["resourceVersion"] + item["metadata"]["uid"] = resourceItem["metadata"]["uid"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + if !resourceItem["metadata"]["deletionTimestamp"].nil? + item["metadata"]["deletionTimestamp"] = resourceItem["metadata"]["deletionTimestamp"] + end + end + isWindowsPod = isWindowsPodItem(resourceItem, winNodes) + item["spec"] = {} + if !resourceItem["spec"].nil? 
+ item["spec"]["containers"] = [] + isDisableClusterCollectEnvVar = false + clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] + if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? && clusterCollectEnvironmentVar.casecmp("false") == 0 + isDisableClusterCollectEnvVar = true + end + if !resourceItem["spec"]["containers"].nil? + resourceItem["spec"]["containers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + # fields required for windows containers records + if isWindowsPod + currentContainer["image"] = container["image"] + currentContainer["ports"] = container["ports"] + currentContainer["command"] = container["command"] + currentContainer["EnvironmentVar"] = "" + if !isDisableClusterCollectEnvVar + currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + end + end + item["spec"]["containers"].push(currentContainer) + end + end + item["spec"]["initContainers"] = [] + if !resourceItem["spec"]["initContainers"].nil? + resourceItem["spec"]["initContainers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + # fields required for windows containers records + if isWindowsPod + currentContainer["image"] = container["image"] + currentContainer["ports"] = container["ports"] + currentContainer["command"] = container["command"] + currentContainer["EnvironmentVar"] = "" + if !isDisableClusterCollectEnvVar + currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + end + end + item["spec"]["initContainers"].push(currentContainer) + end + end + item["spec"]["nodeName"] = "" + if !resourceItem["spec"]["nodeName"].nil? 
+ item["spec"]["nodeName"] = resourceItem["spec"]["nodeName"] + end + end + item["status"] = {} + if !resourceItem["status"].nil? + item["status"] = resourceItem["status"] + if !resourceItem["status"]["startTime"].nil? + item["status"]["startTime"] = resourceItem["status"]["startTime"] + end + if !resourceItem["status"]["reason"].nil? + item["status"]["reason"] = resourceItem["status"]["reason"] + end + if !resourceItem["status"]["podIP"].nil? + item["status"]["podIP"] = resourceItem["status"]["podIP"] + end + if !resourceItem["status"]["phase"].nil? + item["status"]["phase"] = resourceItem["status"]["phase"] + end + item["status"]["conditions"] = [] + if !resourceItem["status"]["conditions"].nil? + resourceItem["status"]["conditions"].each do |condition| + currentCondition = {} + currentCondition["type"] = condition["type"] + currentCondition["status"] = condition["status"] + ## TODO - check if we need this + currentCondition["lastTransitionTime"] = condition["lastTransitionTime"] + item["status"]["conditions"].push(currentCondition) + end + end + item["status"]["initContainerStatuses"] = [] + if !resourceItem["status"]["initContainerStatuses"].nil? + resourceItem["status"]["initContainerStatuses"].each do |containerStatus| + currentContainerStatus = {} + currentContainerStatus["containerID"] = containerStatus["containerID"] + currentContainerStatus["name"] = containerStatus["name"] + currentContainerStatus["restartCount"] = containerStatus["restartCount"] + currentContainerStatus["state"] = containerStatus["state"] + currentContainerStatus["lastState"] = containerStatus["lastState"] + if isWindowsPod + currentContainerStatus["imageID"] = containerStatus["imageID"] + end + item["status"]["initContainerStatuses"].push(currentContainerStatus) + end + end + item["status"]["containerStatuses"] = [] + if !resourceItem["status"]["containerStatuses"].nil? 
+ resourceItem["status"]["containerStatuses"].each do |containerStatus| + currentContainerStatus = {} + currentContainerStatus["containerID"] = containerStatus["containerID"] + currentContainerStatus["name"] = containerStatus["name"] + currentContainerStatus["restartCount"] = containerStatus["restartCount"] + currentContainerStatus["state"] = containerStatus["state"] + currentContainerStatus["lastState"] = containerStatus["lastState"] + if isWindowsPod + currentContainerStatus["imageID"] = containerStatus["imageID"] + end + item["status"]["containerStatuses"].push(currentContainerStatus) + end + end + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getPodOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end + + def getNodeOptimizedItem(resourceItem) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + if !resourceItem["metadata"]["labels"].nil? + item["metadata"]["labels"] = resourceItem["metadata"]["labels"] + end + end + item["spec"] = {} + if !resourceItem["spec"].nil? + if !resourceItem["spec"]["providerID"].nil? && !resourceItem["spec"]["providerID"].empty? + item["spec"]["providerID"] = resourceItem["spec"]["providerID"] + end + end + item["status"] = {} + if !resourceItem["status"].nil? + item["status"]["conditions"] = resourceItem["status"]["conditions"] + item["status"]["nodeInfo"] = {} + nodeInfo = {} + if !resourceItem["status"]["nodeInfo"].nil? && !resourceItem["status"]["nodeInfo"].empty? 
+ nodeInfo["kubeletVersion"] = resourceItem["status"]["nodeInfo"]["kubeletVersion"] + nodeInfo["kubeProxyVersion"] = resourceItem["status"]["nodeInfo"]["kubeProxyVersion"] + nodeInfo["osImage"] = resourceItem["status"]["nodeInfo"]["osImage"] + nodeInfo["containerRuntimeVersion"] = resourceItem["status"]["nodeInfo"]["containerRuntimeVersion"] + nodeInfo["operatingSystem"] = resourceItem["status"]["nodeInfo"]["operatingSystem"] + nodeInfo["kernelVersion"] = resourceItem["status"]["nodeInfo"]["kernelVersion"] + end + item["status"]["nodeInfo"] = nodeInfo + item["status"]["allocatable"] = resourceItem["status"]["allocatable"] + item["status"]["capacity"] = resourceItem["status"]["capacity"] + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getNodeOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end + + def getDeploymentOptimizedItem(resourceItem) + item = {} + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + end + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["strategy"] = {} + if !resourceItem["spec"]["strategy"].nil? && !resourceItem["spec"]["strategy"].empty? && !resourceItem["spec"]["strategy"]["type"].nil? + item["spec"]["strategy"]["type"] = resourceItem["spec"]["strategy"]["type"] + end + if !resourceItem["spec"]["replicas"].nil? + item["spec"]["replicas"] = resourceItem["spec"]["replicas"] + end + end + item["status"] = {} + if !resourceItem["status"].nil? + if !resourceItem["status"]["readyReplicas"].nil? + item["status"]["readyReplicas"] = resourceItem["status"]["readyReplicas"] + end + if !resourceItem["status"]["updatedReplicas"].nil? + item["status"]["updatedReplicas"] = resourceItem["status"]["updatedReplicas"] + end + if !resourceItem["status"]["availableReplicas"].nil? 
+ item["status"]["availableReplicas"] = resourceItem["status"]["availableReplicas"] + end + end + return item + end + + def getHpaOptimizedItem(resourceItem) + item = {} + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] + end + item["spec"] = {} + if !resourceItem["spec"].nil? + if !resourceItem["spec"]["minReplicas"].nil? + item["spec"]["minReplicas"] = resourceItem["spec"]["minReplicas"] + end + if !resourceItem["spec"]["maxReplicas"].nil? + item["spec"]["maxReplicas"] = resourceItem["spec"]["maxReplicas"] + end + item["spec"]["scaleTargetRef"] = {} + if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["kind"].nil? + item["spec"]["scaleTargetRef"]["kind"] = resourceItem["spec"]["scaleTargetRef"]["kind"] + end + if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["name"].nil? + item["spec"]["scaleTargetRef"]["name"] = resourceItem["spec"]["scaleTargetRef"]["name"] + end + end + item["status"] = {} + if !resourceItem["status"].nil? + if !resourceItem["status"]["currentReplicas"].nil? + item["status"]["currentReplicas"] = resourceItem["status"]["currentReplicas"] + end + if !resourceItem["status"]["desiredReplicas"].nil? + item["status"]["desiredReplicas"] = resourceItem["status"]["desiredReplicas"] + end + if !resourceItem["status"]["lastScaleTime"].nil? 
# HTTP chunked-transfer stream used to watch ongoing changes on Kubernetes
# entities (nodes, pods, services, ...). Each watch event arrives on the
# response body as one newline-delimited JSON document.
class WatchStream
  # uri          - URI of the watch endpoint (path and query already encoded)
  # http_options - options hash forwarded to Net::HTTP.start (e.g. use_ssl, ca_file, timeouts)
  # http_headers - request header hash (e.g. Authorization bearer token)
  # logger       - Logger instance used for diagnostics
  def initialize(uri, http_options, http_headers, logger)
    @uri = uri
    @http_client = nil
    @http_options = http_options
    @http_headers = http_headers
    # initialized here so finish()/each() never see an undefined ivar
    @finished = false
    @logger = logger
    @logger.info "WatchStream:initialize @ #{Time.now.utc.iso8601}"
  end

  # Opens the HTTP session, issues the watch GET, and yields each complete
  # newline-terminated JSON event (parsed into a Hash) to the caller's block.
  # Blocks until the server closes the stream, #finish is called from another
  # thread, or an error occurs.
  # Raises a RuntimeError when the watch request answers with an HTTP status > 300.
  def each
    @finished = false
    buffer = +""
    @logger.info "WatchStream: Opening TCP session @ #{Time.now.utc.iso8601}"
    @http_client = Net::HTTP.start(@uri.host, @uri.port, @http_options)
    path = @uri.path
    path += "?" + @uri.query if !@uri.query.nil? && !@uri.query.empty?
    @logger.info "WatchStream: Making GET API call for Watch with path: #{path} @ #{Time.now.utc.iso8601}"
    @http_client.request_get(path, @http_headers) do |response|
      if !response.nil? && response.code.to_i > 300
        raise "WatchStream: watch connection failed with an http status code: #{response.code}"
      end
      response.read_body do |chunk|
        # stop yielding promptly once finish() has been requested from another thread
        break if @finished
        buffer << chunk
        # emit every complete line; a trailing partial line stays buffered
        # until the rest of it arrives in a later chunk
        while (line = buffer.slice!(/.+\n/))
          yield(Yajl::Parser.parse(StringIO.new(line.chomp)))
        end
      end
    end
  end

  # Marks the stream finished and closes the underlying HTTP session.
  # Safe to call multiple times, before #each has run, and from another
  # thread; errors during close are logged and swallowed.
  def finish
    begin
      @finished = true
      @logger.info "WatchStream:finish HTTP session @ #{Time.now.utc.iso8601}"
      @http_client.finish if !@http_client.nil? && @http_client.started?
    rescue => error
      @logger.warn "WatchStream:finish failed with an error: #{error} @ #{Time.now.utc.iso8601}"
    end
  end
end
DateTime.now.to_time.to_i @@ -110,6 +114,7 @@ def shutdown @condition.signal } @thread.join + @watchNodesThread.join super # This super must be at the end of shutdown method end end @@ -138,7 +143,7 @@ def enumerate if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = @extensionUtils.getOutputStreamId(Constants::KUBE_NODE_INVENTORY_DATA_TYPE) end - $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using containernodeinventory tag -#{@ContainerNodeInventoryTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using kubenodeinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") @@ -147,11 +152,11 @@ def enumerate # Initializing continuation token to nil continuationToken = nil - $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - # KubernetesApiClient.getNodesResourceUri is a pure function, so call it from the actual module instead of from the mock - resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") - continuationToken, nodeInventory = @kubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory = {} + @nodeCacheMutex.synchronize { + nodeInventory["items"] = @nodeItemsCache.values.clone + #@nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024 + } nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
@@ -160,21 +165,6 @@ def enumerate else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i - continuationToken, nodeInventory = @kubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") - nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i - @nodesAPIE2ELatencyMs = @nodesAPIE2ELatencyMs + (nodesAPIChunkEndTime - nodesAPIChunkStartTime) - if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(nodeInventory, batchTime) - else - $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" - end - end - @nodeInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - nodeInventoryStartTime) timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 @@ -312,80 +302,80 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) # Adding telemetry to send node telemetry every 10 minutes timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - begin - properties = getNodeTelemetryProps(item) - properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] - capacityInfo = item["status"]["capacity"] - - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) - begin - if (!capacityInfo["nvidia.com/gpu"].nil?) 
&& (!capacityInfo["nvidia.com/gpu"].empty?) - properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] - end + #if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + begin + properties = getNodeTelemetryProps(item) + properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] + capacityInfo = item["status"]["capacity"] - if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) - properties["amdgpus"] = capacityInfo["amd.com/gpu"] - end - rescue => errorStr - $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + begin + if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) + properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] end - # Telemetry for data collection config for replicaset - if (File.file?(@@configMapMountPath)) - properties["collectAllKubeEvents"] = @@collectAllKubeEvents + if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) 
+ properties["amdgpus"] = capacityInfo["amd.com/gpu"] end + rescue => errorStr + $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength - properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength - properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength - end - # telemetry about osm metric settings for replicaset - if (File.file?(@@osmConfigMountPath)) - properties["osmNamespaceCount"] = @@osmNamespaceCount - end - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents + end - # Telemetry for data collection config for replicaset - if (File.file?(@@configMapMountPath)) - properties["collectAllKubeEvents"] = @@collectAllKubeEvents - end + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = @@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = 
@@rsPromMonitorPodsNamespaceLength + properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength + properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength + end + # telemetry about osm metric settings for replicaset + if (File.file?(@@osmConfigMountPath)) + properties["osmNamespaceCount"] = @@osmNamespaceCount + end + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + telemetrySent = true - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength - properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength - properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength - end - # telemetry about osm metric settings for replicaset - if (File.file?(@@osmConfigMountPath)) - properties["osmNamespaceCount"] = @@osmNamespaceCount - end - @applicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true - rescue => errorStr - $log.warn "Failed in getting telemetry in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - @applicationInsightsUtility.sendExceptionTelemetry(errorStr) + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents + end + + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] 
    # Background-thread entry point (started from #start): keeps @nodeItemsCache
    # in sync with the cluster using the Kubernetes list+watch protocol.
    # - When nodesResourceVersion is nil, performs a full paged LIST (clearing the
    #   cache first) and records the list's resourceVersion.
    # - Then opens a WATCH at that resourceVersion and applies ADDED/MODIFIED/
    #   DELETED events to the cache, keyed by metadata.uid.
    # - Any ERROR event, missing resourceVersion, or unexpected exception resets
    #   nodesResourceVersion to nil, which forces a fresh re-list on the next
    #   loop iteration.
    # All cache reads/writes are guarded by @nodeCacheMutex since #enumerate
    # snapshots the cache from the plugin's run thread.
    # NOTE(review): assumes KubernetesApiClient.watch yields parsed notices with
    # "type"/"object" keys (WatchStream semantics) — confirm against the client.
    def watch_nodes
      nodesResourceVersion = nil
      loop do
        begin
          if nodesResourceVersion.nil?
            # clear cache before filling the cache with list
            @nodeCacheMutex.synchronize {
              @nodeItemsCache.clear()
            }
            continuationToken = nil
            $log.info("in_kube_nodes::watch_nodes : Getting nodes from Kube API @ #{Time.now.utc.iso8601}")
            resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}")
            continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri)
            $log.info("in_kube_nodes::watch_nodes : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}")
            if (!nodeInventory.nil? && !nodeInventory.empty?)
              # the list's resourceVersion is the point from which the watch resumes
              nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"]
              if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?)
                $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
                nodeInventory["items"].each do |item|
                  # cache key is the node's UID; value is the trimmed-down item
                  key = item["metadata"]["uid"]
                  nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item)
                  @nodeCacheMutex.synchronize {
                    @nodeItemsCache[key] = nodeItem
                  }
                end
              end
            else
              $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory"
            end
            # drain remaining LIST pages while the API server hands back a continue token
            while (!continuationToken.nil? && !continuationToken.empty?)
              continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}")
              if (!nodeInventory.nil? && !nodeInventory.empty?)
                nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"]
                if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?)
                  $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
                  nodeInventory["items"].each do |item|
                    key = item["metadata"]["uid"]
                    nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item)
                    @nodeCacheMutex.synchronize {
                      @nodeItemsCache[key] = nodeItem
                    }
                  end
                end
              else
                $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory"
              end
            end
          end
          $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
          watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true)
          if watcher.nil?
            $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
          else
            watcher.each do |notice|
              case notice["type"]
              when "ADDED", "MODIFIED", "DELETED", "BOOKMARK"
                item = notice["object"]
                # extract latest resource version to use for watch reconnect
                if !item.nil? && !item.empty? &&
                   !item["metadata"].nil? && !item["metadata"].empty? &&
                   !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty?
                  nodesResourceVersion = item["metadata"]["resourceVersion"]
                  $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}")
                else
                  $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}")
                  nodesResourceVersion = nil
                  # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
                  break
                end
                # BOOKMARK events only advance the resourceVersion; no cache change
                if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED"))
                  key = item["metadata"]["uid"]
                  nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item)
                  @nodeCacheMutex.synchronize {
                    @nodeItemsCache[key] = nodeItem
                  }
                elsif notice["type"] == "DELETED"
                  key = item["metadata"]["uid"]
                  @nodeCacheMutex.synchronize {
                    @nodeItemsCache.delete(key)
                  }
                end
              when "ERROR"
                # e.g. 410 Gone: resourceVersion too old — force a full re-list
                nodesResourceVersion = nil
                $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}")
                break
              else
                $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}")
              end
            end
          end
        rescue Net::ReadTimeout => errorStr
          # read timeout is an expected idle-watch outcome: reconnect at the
          # current resourceVersion without re-listing
          $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
        rescue => errorStr
          $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
          nodesResourceVersion = nil
          sleep(5) # do not overwhelm the api-server if api-server broken
        ensure
          watcher.finish if watcher
        end
      end
    end
def initialize @cacheHash = {} @@ -622,7 +715,7 @@ def clean_cache() end end - nodes_to_remove.each {|node_name| + nodes_to_remove.each { |node_name| @cacheHash.delete(node_name) @timeAdded.delete(node_name) } @@ -630,7 +723,6 @@ def clean_cache() end end # NodeCache - @@cpuCache = NodeCache.new @@memCache = NodeCache.new diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 3f5f4f1cc..1ed91d9cf 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1,7 +1,7 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/input' +require "fluent/plugin/input" module Fluent::Plugin require_relative "podinventory_to_mdm" @@ -12,7 +12,6 @@ class Kube_PodInventory_Input < Input @@MDMKubePodInventoryTag = "mdm.kubepodinventory" @@hostName = (OMS::Common.get_hostname) - def initialize super require "yaml" @@ -20,6 +19,7 @@ def initialize require "yajl" require "set" require "time" + require "net/http" require_relative "kubernetes_container_inventory" require_relative "KubernetesApiClient" @@ -41,6 +41,11 @@ def initialize @controllerData = {} @podInventoryE2EProcessingLatencyMs = 0 @podsAPIE2ELatencyMs = 0 + @watchPodsThread = nil + @podItemsCache = {} + + @watchServicesThread = nil + @serviceItemsCache = {} @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @@ -79,7 +84,11 @@ def start @finished = false @condition = ConditionVariable.new @mutex = Mutex.new + @podCacheMutex = Mutex.new + @serviceCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) + @watchPodsThread = Thread.new(&method(:watch_pods)) + @watchServicesThread = Thread.new(&method(:watch_services)) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -91,6 +100,7 @@ def shutdown @condition.signal } @thread.join + @watchPodsThread.join super # This super must be at the end of shutdown 
method end end @@ -110,55 +120,49 @@ def enumerate(podList = nil) @podInventoryE2EProcessingLatencyMs = 0 podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") - if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - end - if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) - end - if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) - end - if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - end - if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) - end - $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? 
|| !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) + end + if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) + end + if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) + end + $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") end - # Get services first so that we dont need to make a call for very chunk - $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") - # serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body) - $log.info("in_kube_podinventory::enumerate : Done getting services from Kube 
API @ #{Time.now.utc.iso8601}") - - if !serviceInfo.nil? - $log.info("in_kube_podinventory::enumerate:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") - serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) - $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") - serviceInfo = nil - # service inventory records much smaller and fixed size compared to serviceList - serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime) - # updating for telemetry - @serviceCount += serviceRecords.length - serviceList = nil - end + serviceInventory = {} + @serviceCacheMutex.synchronize { + serviceInventory["items"] = @serviceItemsCache.values.clone + } + serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceInventory, batchTime) + # updating for telemetry + @serviceCount = serviceRecords.length + $log.info("in_kube_podinventory::enumerate : number of service items :#{@serviceCount} from Kube API @ #{Time.now.utc.iso8601}") # to track e2e processing latency @podsAPIE2ELatencyMs = 0 podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil continuationToken = nil - $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") - $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + #podItemsCacheSizeKB = 0 + podInventory = {} + @podCacheMutex.synchronize { + podInventory["items"] = @podItemsCache.values.clone + #podItemsCacheSizeKB = @podItemsCache.to_s.length / 1024 + } podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? 
&& !podInventory["items"].empty?) @@ -167,21 +171,6 @@ def enumerate(podList = nil) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") - podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i - @podsAPIE2ELatencyMs = @podsAPIE2ELatencyMs + (podsAPIChunkEndTime - podsAPIChunkStartTime) - if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) - else - $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" - end - end - @podInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - podInventoryStartTime) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil @@ -195,11 +184,13 @@ def enumerate(podList = nil) end # Flush AppInsights telemetry once all the processing is done + telemetryFlush = true if telemetryFlush == true telemetryProperties = {} telemetryProperties["Computer"] = @@hostName telemetryProperties["PODS_CHUNK_SIZE"] = @PODS_CHUNK_SIZE telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE + #telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) 
    # Background-thread entry point (started from #start): keeps @podItemsCache
    # in sync with the cluster via list+watch, mirroring in_kube_nodes#watch_nodes.
    # - nil podsResourceVersion triggers a full paged LIST (cache cleared first);
    #   the list's resourceVersion is then used to open the WATCH.
    # - ADDED/MODIFIED upsert and DELETED removes cache entries keyed by
    #   metadata.uid; ERROR events/exceptions reset the resourceVersion to
    #   force a re-list.
    # All cache access is serialized through @podCacheMutex because #enumerate
    # snapshots the cache from the plugin's run thread.
    def watch_pods
      podsResourceVersion = nil
      isCheckedWindowsNodes = false
      loop do
        begin
          # check if the cluster has windows nodes since windows container records requires inventory specific fields
          # NOTE(review): winNodes is re-resolved each loop iteration and only
          # re-fetched once when the first lookup comes back empty — confirm the
          # intended caching semantics against KubernetesApiClient.
          winNodes = KubernetesApiClient.getWindowsNodesArray()
          if !isCheckedWindowsNodes && winNodes.empty?
            winNodes = KubernetesApiClient.getWindowsNodes()
            isCheckedWindowsNodes = true
          end
          if podsResourceVersion.nil?
            # clear cache before filling the cache with list
            @podCacheMutex.synchronize {
              @podItemsCache.clear()
            }
            continuationToken = nil
            $log.info("in_kube_podinventory::watch_pods : Getting pods from Kube API @ #{Time.now.utc.iso8601}")
            continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}")
            $log.info("in_kube_podinventory::watch_pods : Done getting pods from Kube API @ #{Time.now.utc.iso8601}")
            if (!podInventory.nil? && !podInventory.empty?)
              # resourceVersion of the list is the watch resume point
              podsResourceVersion = podInventory["metadata"]["resourceVersion"]
              if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
                $log.info("in_kube_podinventory::watch_pods : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
                podInventory["items"].each do |item|
                  # cache key is the pod UID; value is the trimmed-down item
                  key = item["metadata"]["uid"]
                  podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes)
                  @podCacheMutex.synchronize {
                    @podItemsCache[key] = podItem
                  }
                end
              end
            else
              $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory"
            end
            # drain remaining LIST pages while a continue token is returned
            while (!continuationToken.nil? && !continuationToken.empty?)
              continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}")
              if (!podInventory.nil? && !podInventory.empty?)
                podsResourceVersion = podInventory["metadata"]["resourceVersion"]
                if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?)
                  $log.info("in_kube_podinventory::watch_pods : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}")
                  podInventory["items"].each do |item|
                    key = item["metadata"]["uid"]
                    podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes)
                    @podCacheMutex.synchronize {
                      @podItemsCache[key] = podItem
                    }
                  end
                end
              else
                $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory"
              end
            end
          end
          $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
          watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true)
          if watcher.nil?
            $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
          else
            watcher.each do |notice|
              case notice["type"]
              when "ADDED", "MODIFIED", "DELETED", "BOOKMARK"
                item = notice["object"]
                # extract latest resource version to use for watch reconnect
                if !item.nil? && !item.empty? &&
                   !item["metadata"].nil? && !item["metadata"].empty? &&
                   !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty?
                  podsResourceVersion = item["metadata"]["resourceVersion"]
                  $log.info("in_kube_podinventory::watch_pods: received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}")
                else
                  $log.info("in_kube_podinventory::watch_pods: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}")
                  podsResourceVersion = nil
                  # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
                  break
                end
                # BOOKMARK events only advance the resourceVersion; no cache change
                if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED"))
                  key = item["metadata"]["uid"]
                  podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes)
                  @podCacheMutex.synchronize {
                    @podItemsCache[key] = podItem
                  }
                elsif notice["type"] == "DELETED"
                  key = item["metadata"]["uid"]
                  @podCacheMutex.synchronize {
                    @podItemsCache.delete(key)
                  }
                end
              when "ERROR"
                # e.g. 410 Gone: resourceVersion too old — force a full re-list
                podsResourceVersion = nil
                $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}")
                break
              else
                $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}")
              end
            end
          end
        rescue Net::ReadTimeout => errorStr
          # idle-watch read timeout: reconnect at the current resourceVersion
          # without re-listing
          $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
        rescue => errorStr
          $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}")
          podsResourceVersion = nil
          sleep(5) # do not overwhelm the api-server if api-server broken
        ensure
          watcher.finish if watcher
        end
      end
    end
+ # clear cache before filling the cache with list + @serviceCacheMutex.synchronize { + @serviceItemsCache.clear() + } + $log.info("in_kube_podinventory::watch_services : Getting services from Kube API @ #{Time.now.utc.iso8601}") + serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") + $log.info("in_kube_podinventory::watch_services : Done getting services from Kube API @ #{Time.now.utc.iso8601}") + if !serviceInfo.nil? + $log.info("in_kube_podinventory::watch_services:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceInventory = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) + $log.info("in_kube_podinventory::watch_services:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceInfo = nil + if (!serviceInventory.nil? && !serviceInventory.empty?) + servicesResourceVersion = serviceInventory["metadata"]["resourceVersion"] + if (serviceInventory.key?("items") && !serviceInventory["items"].nil? && !serviceInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_services : number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}") + serviceInventory["items"].each do |item| + key = item["metadata"]["uid"] + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + end + end + else + $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory" + end + serviceInventory = nil + end + end + + $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? 
+ $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + servicesResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
+ break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + @serviceCacheMutex.synchronize { + @serviceItemsCache.delete(key) + } + end + when "ERROR" + servicesResourceVersion = nil + $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + end + end + end + rescue Net::ReadTimeout => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher + end + end + end end # Kube_Pod_Input end # module diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index 82e36c8cc..ffe92ec40 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -50,7 +50,7 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !atLocation.nil? 
containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1] end - end + end containerInventoryRecord["ExitCode"] = 0 isContainerTerminated = false isContainerWaiting = false @@ -84,19 +84,19 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa end containerInfoMap = containersInfoMap[containerName] - # image can be in any one of below format in spec - # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image + # image can be in any one of below format in spec + # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image imageValue = containerInfoMap["image"] if !imageValue.nil? && !imageValue.empty? # Find delimiters in image format atLocation = imageValue.index("@") - isDigestSpecified = false + isDigestSpecified = false if !atLocation.nil? # repository/image@digest or repository/image:imagetag@digest, image@digest imageValue = imageValue[0..(atLocation - 1)] # Use Digest from the spec's image in case when the status doesnt get populated i.e. container in pending or image pull back etc. if containerInventoryRecord["ImageId"].nil? || containerInventoryRecord["ImageId"].empty? - containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] + containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] end isDigestSpecified = true end @@ -105,14 +105,14 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !colonLocation.nil? if slashLocation.nil? 
# image:imagetag - containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] + containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] else # repository/image:imagetag containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] end containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] - else + else if slashLocation.nil? # image containerInventoryRecord["Image"] = imageValue @@ -120,15 +120,15 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa # repo/image containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..-1] - end + end # if no tag specified, k8s assumes latest as imagetag and this is same behavior from docker API and from status. # Ref - https://kubernetes.io/docs/concepts/containers/images/#image-names - if isDigestSpecified == false + if isDigestSpecified == false containerInventoryRecord["ImageTag"] = "latest" end - end + end end - + podName = containerInfoMap["PodName"] namespace = containerInfoMap["Namespace"] # containername in the format what docker sees @@ -199,7 +199,11 @@ def getContainersInfoMap(podItem, isWindows) cmdValue = container["command"] cmdValueString = (cmdValue.nil?) ? 
"" : cmdValue.to_s containerInfoMap["Command"] = cmdValueString - containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) + if isWindows + containerInfoMap["EnvironmentVar"] = container["env"] + else + containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) + end containersInfoMap[containerName] = containerInfoMap end end @@ -212,47 +216,47 @@ def getContainersInfoMap(podItem, isWindows) return containersInfoMap end - def obtainContainerEnvironmentVars(containerId) + def obtainContainerEnvironmentVars(containerId) envValueString = "" begin - isCGroupPidFetchRequired = false + isCGroupPidFetchRequired = false if !@@containerCGroupCache.has_key?(containerId) - isCGroupPidFetchRequired = true + isCGroupPidFetchRequired = true else cGroupPid = @@containerCGroupCache[containerId] - if cGroupPid.nil? || cGroupPid.empty? + if cGroupPid.nil? || cGroupPid.empty? isCGroupPidFetchRequired = true @@containerCGroupCache.delete(containerId) - elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") + elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") isCGroupPidFetchRequired = true - @@containerCGroupCache.delete(containerId) - end + @@containerCGroupCache.delete(containerId) + end end - if isCGroupPidFetchRequired + if isCGroupPidFetchRequired Dir["/hostfs/proc/*/cgroup"].each do |filename| begin if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any? 
# file full path is /hostfs/proc//cgroup - cGroupPid = filename.split("/")[3] - if is_number?(cGroupPid) + cGroupPid = filename.split("/")[3] + if is_number?(cGroupPid) if @@containerCGroupCache.has_key?(containerId) - tempCGroupPid = @@containerCGroupCache[containerId] + tempCGroupPid = @@containerCGroupCache[containerId] if tempCGroupPid.to_i > cGroupPid.to_i @@containerCGroupCache[containerId] = cGroupPid end else @@containerCGroupCache[containerId] = cGroupPid - end + end end end - rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read - end - end + rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read + end + end end cGroupPid = @@containerCGroupCache[containerId] if !cGroupPid.nil? && !cGroupPid.empty? - environFilePath = "/hostfs/proc/#{cGroupPid}/environ" + environFilePath = "/hostfs/proc/#{cGroupPid}/environ" if File.exist?(environFilePath) # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE # Check to see if the environment variable collection is disabled for this container. @@ -265,7 +269,7 @@ def obtainContainerEnvironmentVars(containerId) if !envVars.nil? && !envVars.empty? envVars = envVars.split("\0") envValueString = envVars.to_json - envValueStringLength = envValueString.length + envValueStringLength = envValueString.length if envValueStringLength >= 200000 lastIndex = envValueString.rindex("\",") if !lastIndex.nil? 
@@ -376,6 +380,7 @@ def deleteCGroupCacheEntryForDeletedContainer(containerId) ApplicationInsightsUtility.sendExceptionTelemetry(error) end end + def is_number?(value) true if Integer(value) rescue false end From 30decbb1868de03798c4634f77eccfd7971a1938 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 3 Jan 2022 19:20:09 -0800 Subject: [PATCH 02/65] fix weird bug --- source/plugins/ruby/KubernetesApiClient.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 319129cae..40f80886a 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -1000,7 +1000,6 @@ def getPodOptimizedItem(resourceItem, winNodes) end item["status"] = {} if !resourceItem["status"].nil? - item["status"] = resourceItem["status"] if !resourceItem["status"]["startTime"].nil? item["status"]["startTime"] = resourceItem["status"]["startTime"] end From 540ca90ce032c216ac2bd66069fe0807ad0c60d4 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 4 Jan 2022 18:14:50 -0800 Subject: [PATCH 03/65] multiproc support for fluentd --- build/linux/installer/conf/kube.conf | 12 +- kubernetes/linux/main.sh | 380 ++++++++++++++------------- kubernetes/omsagent.yaml | 4 +- 3 files changed, 213 insertions(+), 183 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 1340a27a4..ac9735e20 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -1,6 +1,6 @@ #fluent forward plugin - workers 2 + workers "#{ENV['NUM_OF_FLUENTD_WORKERS']}" root_dir /var/opt/microsoft/docker-cimprov/state @@ -64,7 +64,7 @@ - + #Kubernetes pod inventory @type kube_podinventory @@ -143,7 +143,8 @@ retry_mdm_post_wait_minutes 30 - + + #Kubernetes Nodes @type kube_nodes @@ -228,8 +229,7 @@ retry_mdm_post_wait_minutes 30 - - + #fluent forward plugin @type forward @@ -414,4 +414,4 @@ 
retry_mdm_post_wait_minutes 30 - + \ No newline at end of file diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index a9184ab53..023cc11e4 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -15,8 +15,7 @@ waitforlisteneronTCPport() { if [[ $port =~ $numeric ]] && [[ $waittimesecs =~ $numeric ]]; then #local varlistener=$(netstat -lnt | awk '$6 == "LISTEN" && $4 ~ ":25228$"') - while true - do + while true; do if [ $totalsleptsecs -gt $waittimesecs ]; then echo "${FUNCNAME[0]} giving up waiting for listener on port:$port after $totalsleptsecs secs" return 1 @@ -25,7 +24,7 @@ waitforlisteneronTCPport() { if [ -z "$varlistener" ]; then #echo "${FUNCNAME[0]} waiting for $sleepdurationsecs more sec for listener on port:$port ..." sleep $sleepdurationsecs - totalsleptsecs=$(($totalsleptsecs+1)) + totalsleptsecs=$(($totalsleptsecs + 1)) else echo "${FUNCNAME[0]} found listener on port:$port in $totalsleptsecs secs" return 0 @@ -57,23 +56,22 @@ checkAgentOnboardingStatus() { successMessage="Loaded data sources" failureMessage="Failed to load data sources into config" fi - while true - do - if [ $totalsleptsecs -gt $waittimesecs ]; then - echo "${FUNCNAME[0]} giving up checking agent onboarding status after $totalsleptsecs secs" - return 1 - fi - - if grep "$successMessage" "${MDSD_LOG}/mdsd.info"; then - echo "Onboarding success" - return 0 - elif grep "$failureMessage" "${MDSD_LOG}/mdsd.err"; then - echo "Onboarding Failure: Reason: Failed to onboard the agent" - echo "Onboarding Failure: Please verify log analytics workspace configuration such as existence of the workspace, workspace key and workspace enabled for public ingestion" - return 1 - fi - sleep $sleepdurationsecs - totalsleptsecs=$(($totalsleptsecs+1)) + while true; do + if [ $totalsleptsecs -gt $waittimesecs ]; then + echo "${FUNCNAME[0]} giving up checking agent onboarding status after $totalsleptsecs secs" + return 1 + fi + + if grep "$successMessage" 
"${MDSD_LOG}/mdsd.info"; then + echo "Onboarding success" + return 0 + elif grep "$failureMessage" "${MDSD_LOG}/mdsd.err"; then + echo "Onboarding Failure: Reason: Failed to onboard the agent" + echo "Onboarding Failure: Please verify log analytics workspace configuration such as existence of the workspace, workspace key and workspace enabled for public ingestion" + return 1 + fi + sleep $sleepdurationsecs + totalsleptsecs=$(($totalsleptsecs + 1)) done else echo "${FUNCNAME[0]} called with non-numeric arguments<$2>. Required arguments <#wait-time-in-seconds>" @@ -82,7 +80,6 @@ checkAgentOnboardingStatus() { fi } - #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding mkdir -p /var/opt/microsoft/docker-cimprov/state @@ -90,8 +87,8 @@ mkdir -p /var/opt/microsoft/docker-cimprov/state inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s' #Run inotify as a daemon to track changes to the mounted configmap for OSM settings. -if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || - ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then +if [[ ((! 
-e "/etc/config/kube.conf") && ("${CONTAINER_TYPE}" == "PrometheusSidecar")) || + ((-e "/etc/config/kube.conf") && ("${SIDECAR_SCRAPING_ENABLED}" == "false")) ]]; then inotifywait /etc/config/osm-settings --daemon --recursive --outfile "/opt/inotifyoutput-osm.txt" --event create,delete --format '%e : %T' --timefmt '+%s' fi @@ -100,58 +97,58 @@ if [ -z $AKS_RESOURCE_ID ]; then echo "not setting customResourceId" else export customResourceId=$AKS_RESOURCE_ID - echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc + echo "export customResourceId=$AKS_RESOURCE_ID" >>~/.bashrc source ~/.bashrc echo "customResourceId:$customResourceId" export customRegion=$AKS_REGION - echo "export customRegion=$AKS_REGION" >> ~/.bashrc + echo "export customRegion=$AKS_REGION" >>~/.bashrc source ~/.bashrc echo "customRegion:$customRegion" fi #set agent config schema version -if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/schema-version" ]; then +if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/schema-version" ]; then #trim config_schema_version="$(cat /etc/config/settings/schema-version | xargs)" #remove all spaces config_schema_version="${config_schema_version//[[:space:]]/}" #take first 10 characters - config_schema_version="$(echo $config_schema_version| cut -c1-10)" + config_schema_version="$(echo $config_schema_version | cut -c1-10)" export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version - echo "export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version" >> ~/.bashrc + echo "export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version" >>~/.bashrc source ~/.bashrc echo "AZMON_AGENT_CFG_SCHEMA_VERSION:$AZMON_AGENT_CFG_SCHEMA_VERSION" fi #set agent config file version -if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/config-version" ]; then +if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/config-version" ]; then #trim config_file_version="$(cat 
/etc/config/settings/config-version | xargs)" #remove all spaces config_file_version="${config_file_version//[[:space:]]/}" #take first 10 characters - config_file_version="$(echo $config_file_version| cut -c1-10)" + config_file_version="$(echo $config_file_version | cut -c1-10)" export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version - echo "export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version" >> ~/.bashrc + echo "export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version" >>~/.bashrc source ~/.bashrc echo "AZMON_AGENT_CFG_FILE_VERSION:$AZMON_AGENT_CFG_FILE_VERSION" fi #set OSM config schema version -if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || - ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then - if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then +if [[ ((! -e "/etc/config/kube.conf") && ("${CONTAINER_TYPE}" == "PrometheusSidecar")) || + ((-e "/etc/config/kube.conf") && ("${SIDECAR_SCRAPING_ENABLED}" == "false")) ]]; then + if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then #trim osm_config_schema_version="$(cat /etc/config/osm-settings/schema-version | xargs)" #remove all spaces osm_config_schema_version="${osm_config_schema_version//[[:space:]]/}" #take first 10 characters - osm_config_schema_version="$(echo $osm_config_schema_version| cut -c1-10)" + osm_config_schema_version="$(echo $osm_config_schema_version | cut -c1-10)" export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version - echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >> ~/.bashrc + echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >>~/.bashrc source ~/.bashrc echo "AZMON_OSM_CFG_SCHEMA_VERSION:$AZMON_OSM_CFG_SCHEMA_VERSION" fi @@ -175,7 +172,7 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then # convert the protocol prefix in lowercase for 
validation proxyprotocol=$(echo $proto | tr "[:upper:]" "[:lower:]") if [ "$proxyprotocol" != "http://" -a "$proxyprotocol" != "https://" ]; then - echo "-e error proxy endpoint should be in this format http(s)://:@:" + echo "-e error proxy endpoint should be in this format http(s)://:@:" fi # remove the protocol url="$(echo ${PROXY_ENDPOINT/$proto/})" @@ -191,53 +188,53 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then port="$(echo $hostport | sed -e 's,^.*:,:,g' -e 's,.*:\([0-9]*\).*,\1,g' -e 's,[^0-9],,g')" if [ -z "$user" -o -z "$pwd" -o -z "$host" -o -z "$port" ]; then - echo "-e error proxy endpoint should be in this format http(s)://:@:" + echo "-e error proxy endpoint should be in this format http(s)://:@:" else - echo "successfully validated provided proxy endpoint is valid and expected format" + echo "successfully validated provided proxy endpoint is valid and expected format" fi - echo $pwd > /opt/microsoft/docker-cimprov/proxy_password + echo $pwd >/opt/microsoft/docker-cimprov/proxy_password export MDSD_PROXY_MODE=application - echo "export MDSD_PROXY_MODE=$MDSD_PROXY_MODE" >> ~/.bashrc + echo "export MDSD_PROXY_MODE=$MDSD_PROXY_MODE" >>~/.bashrc export MDSD_PROXY_ADDRESS=$proto$hostport - echo "export MDSD_PROXY_ADDRESS=$MDSD_PROXY_ADDRESS" >> ~/.bashrc + echo "export MDSD_PROXY_ADDRESS=$MDSD_PROXY_ADDRESS" >>~/.bashrc export MDSD_PROXY_USERNAME=$user - echo "export MDSD_PROXY_USERNAME=$MDSD_PROXY_USERNAME" >> ~/.bashrc + echo "export MDSD_PROXY_USERNAME=$MDSD_PROXY_USERNAME" >>~/.bashrc export MDSD_PROXY_PASSWORD_FILE=/opt/microsoft/docker-cimprov/proxy_password - echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >> ~/.bashrc - + echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >>~/.bashrc + #TODO: Compression + proxy creates a deserialization error in ODS. 
This needs a fix in MDSD export MDSD_ODS_COMPRESSION_LEVEL=0 - echo "export MDSD_ODS_COMPRESSION_LEVEL=$MDSD_ODS_COMPRESSION_LEVEL" >> ~/.bashrc + echo "export MDSD_ODS_COMPRESSION_LEVEL=$MDSD_ODS_COMPRESSION_LEVEL" >>~/.bashrc fi if [ ! -z "$PROXY_ENDPOINT" ]; then - echo "Making curl request to oms endpint with domain: $domain and proxy: $PROXY_ENDPOINT" - curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT + echo "Making curl request to oms endpint with domain: $domain and proxy: $PROXY_ENDPOINT" + curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT else - echo "Making curl request to oms endpint with domain: $domain" - curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest + echo "Making curl request to oms endpint with domain: $domain" + curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest fi if [ $? -ne 0 ]; then if [ ! -z "$PROXY_ENDPOINT" ]; then - echo "Making curl request to ifconfig.co with proxy: $PROXY_ENDPOINT" - RET=`curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co --proxy $PROXY_ENDPOINT` + echo "Making curl request to ifconfig.co with proxy: $PROXY_ENDPOINT" + RET=$(curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co --proxy $PROXY_ENDPOINT) else - echo "Making curl request to ifconfig.co" - RET=`curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co` + echo "Making curl request to ifconfig.co" + RET=$(curl --max-time 10 -s -o /dev/null -w "%{http_code}" ifconfig.co) fi if [ $RET -eq 000 ]; then echo "-e error Error resolving host during the onboarding request. Check the internet connectivity and/or network policy on the cluster" else # Retrying here to work around network timing issue if [ ! -z "$PROXY_ENDPOINT" ]; then - echo "ifconfig check succeeded, retrying oms endpoint with proxy..." 
- curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT + echo "ifconfig check succeeded, retrying oms endpoint with proxy..." + curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest --proxy $PROXY_ENDPOINT else - echo "ifconfig check succeeded, retrying oms endpoint..." - curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest + echo "ifconfig check succeeded, retrying oms endpoint..." + curl --max-time 10 https://$workspaceId.oms.$domain/AgentService.svc/LinuxAgentTopologyRequest fi if [ $? -ne 0 ]; then @@ -253,59 +250,57 @@ else echo "LA Onboarding:Workspace Id not mounted, skipping the telemetry check" fi - # Set environment variable for if public cloud by checking the workspace domain. if [ -z $domain ]; then - ClOUD_ENVIRONMENT="unknown" + ClOUD_ENVIRONMENT="unknown" elif [ $domain == "opinsights.azure.com" ]; then - CLOUD_ENVIRONMENT="azurepubliccloud" + CLOUD_ENVIRONMENT="azurepubliccloud" elif [ $domain == "opinsights.azure.cn" ]; then - CLOUD_ENVIRONMENT="azurechinacloud" + CLOUD_ENVIRONMENT="azurechinacloud" elif [ $domain == "opinsights.azure.us" ]; then - CLOUD_ENVIRONMENT="azureusgovernmentcloud" + CLOUD_ENVIRONMENT="azureusgovernmentcloud" elif [ $domain == "opinsights.azure.eaglex.ic.gov" ]; then - CLOUD_ENVIRONMENT="usnat" + CLOUD_ENVIRONMENT="usnat" elif [ $domain == "opinsights.azure.microsoft.scloud" ]; then - CLOUD_ENVIRONMENT="ussec" + CLOUD_ENVIRONMENT="ussec" fi export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT -echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc +echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >>~/.bashrc #consisten naming conventions with the windows export DOMAIN=$domain -echo "export DOMAIN=$DOMAIN" >> ~/.bashrc +echo "export DOMAIN=$DOMAIN" >>~/.bashrc export WSID=$workspaceId -echo "export WSID=$WSID" >> ~/.bashrc +echo "export WSID=$WSID" >>~/.bashrc # Check if the 
instrumentation key needs to be fetched from a storage account (as in airgapped clouds) -if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) +if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) for BACKOFF in {1..4}; do - KEY=$(curl -sS $APPLICATIONINSIGHTS_AUTH_URL ) + KEY=$(curl -sS $APPLICATIONINSIGHTS_AUTH_URL) # there's no easy way to get the HTTP status code from curl, so just check if the result is well formatted if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then break else - sleep $((2**$BACKOFF / 4)) # (exponential backoff) + sleep $((2 ** $BACKOFF / 4)) # (exponential backoff) fi done # validate that the retrieved data is an instrumentation key if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then export APPLICATIONINSIGHTS_AUTH=$(echo $KEY) - echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >> ~/.bashrc + echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >>~/.bashrc echo "Using cloud-specific instrumentation key" else # no ikey can be retrieved. Disable telemetry and continue export DISABLE_TELEMETRY=true - echo "export DISABLE_TELEMETRY=true" >> ~/.bashrc + echo "export DISABLE_TELEMETRY=true" >>~/.bashrc echo "Could not get cloud-specific instrumentation key (network error?). 
Disabling telemetry" fi fi - aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey -echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc +echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >>~/.bashrc source ~/.bashrc @@ -314,7 +309,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser.rb cat config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source config_env_var fi @@ -326,7 +321,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then cat agent_config_env_var | while read line; do #echo $line - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source agent_config_env_var @@ -335,7 +330,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then cat integration_npm_config_env_var | while read line; do #echo $line - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source integration_npm_config_env_var fi @@ -352,18 +347,18 @@ fi if [ ! -e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then cat defaultpromenvvariables-sidecar | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source defaultpromenvvariables-sidecar else cat defaultpromenvvariables | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source defaultpromenvvariables fi else cat defaultpromenvvariables-rs | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source defaultpromenvvariables-rs fi @@ -371,7 +366,7 @@ fi #Sourcing telemetry environment variable file if it exists if [ -e "telemetry_prom_config_env_var" ]; then cat telemetry_prom_config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source telemetry_prom_config_env_var fi @@ -384,20 +379,19 @@ if [ ! 
-e "/etc/config/kube.conf" ]; then #Sourcing config environment variable file if it exists if [ -e "side_car_fbit_config_env_var" ]; then cat side_car_fbit_config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source side_car_fbit_config_env_var fi fi fi - #Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting. if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser-mdm-metrics-config.rb cat config_mdm_metrics_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source config_mdm_metrics_env_var @@ -405,19 +399,19 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser-metric-collection-config.rb cat config_metric_collection_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source config_metric_collection_env_var fi # OSM scraping to be done in replicaset if sidecar car scraping is disabled and always do the scraping from the sidecar (It will always be either one of the two) -if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || - ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then +if [[ ((! 
-e "/etc/config/kube.conf") && ("${CONTAINER_TYPE}" == "PrometheusSidecar")) || + ((-e "/etc/config/kube.conf") && ("${SIDECAR_SCRAPING_ENABLED}" == "false")) ]]; then /usr/bin/ruby2.6 tomlparser-osm-config.rb if [ -e "integration_osm_config_env_var" ]; then cat integration_osm_config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source integration_osm_config_env_var fi @@ -427,7 +421,7 @@ fi echo "Making wget request to cadvisor endpoint with port 10250" #Defaults to use port 10255 cAdvisorIsSecure=false -RET_CODE=`wget --server-response https://$NODE_IP:10250/stats/summary --no-check-certificate --header="Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" 2>&1 | awk '/^ HTTP/{print $2}'` +RET_CODE=$(wget --server-response https://$NODE_IP:10250/stats/summary --no-check-certificate --header="Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" 2>&1 | awk '/^ HTTP/{print $2}') if [ $RET_CODE -eq 200 ]; then cAdvisorIsSecure=true fi @@ -439,17 +433,17 @@ export NODE_NAME="" if [ "$cAdvisorIsSecure" = true ]; then echo "Wget request using port 10250 succeeded. Using 10250" export IS_SECURE_CADVISOR_PORT=true - echo "export IS_SECURE_CADVISOR_PORT=true" >> ~/.bashrc + echo "export IS_SECURE_CADVISOR_PORT=true" >>~/.bashrc export CADVISOR_METRICS_URL="https://$NODE_IP:10250/metrics" - echo "export CADVISOR_METRICS_URL=https://$NODE_IP:10250/metrics" >> ~/.bashrc + echo "export CADVISOR_METRICS_URL=https://$NODE_IP:10250/metrics" >>~/.bashrc echo "Making curl request to cadvisor endpoint /pods with port 10250 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s -k -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" https://$NODE_IP:10250/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') else echo "Wget request using port 10250 failed. 
Using port 10255" export IS_SECURE_CADVISOR_PORT=false - echo "export IS_SECURE_CADVISOR_PORT=false" >> ~/.bashrc + echo "export IS_SECURE_CADVISOR_PORT=false" >>~/.bashrc export CADVISOR_METRICS_URL="http://$NODE_IP:10255/metrics" - echo "export CADVISOR_METRICS_URL=http://$NODE_IP:10255/metrics" >> ~/.bashrc + echo "export CADVISOR_METRICS_URL=http://$NODE_IP:10255/metrics" >>~/.bashrc echo "Making curl request to cadvisor endpoint with port 10255 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s http://$NODE_IP:10255/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') fi @@ -461,13 +455,13 @@ if [ ! -z "$podWithValidContainerId" ]; then containerRuntime=$(echo $containerRuntime | tr "[:upper:]" "[:lower:]") nodeName=$(echo $nodeName | tr "[:upper:]" "[:lower:]") # update runtime only if its not empty, not null and not startswith docker - if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then + if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then echo "using default container runtime as $CONTAINER_RUNTIME since got containeRuntime as empty or null" elif [[ $containerRuntime != docker* ]]; then export CONTAINER_RUNTIME=$containerRuntime fi - if [ -z "$nodeName" -o "$nodeName" == null ]; then + if [ -z "$nodeName" -o "$nodeName" == null ]; then echo "-e error nodeName in /pods API response is empty" else export NODE_NAME=$nodeName @@ -477,31 +471,31 @@ else fi echo "configured container runtime on kubelet is : "$CONTAINER_RUNTIME -echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >> ~/.bashrc +echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >>~/.bashrc export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="kubelet_runtime_operations_total" -echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >> ~/.bashrc +echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >>~/.bashrc export 
KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="kubelet_runtime_operations_errors_total" -echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >> ~/.bashrc +echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >>~/.bashrc # default to docker metrics export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_docker_operations" export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_docker_operations_errors" if [ "$CONTAINER_RUNTIME" != "docker" ]; then - # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 - export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" - export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" + # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 + export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" + export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" fi echo "set caps for ruby process to read container env from proc" sudo setcap cap_sys_ptrace,cap_dac_read_search+ep /usr/bin/ruby2.6 -echo "export KUBELET_RUNTIME_OPERATIONS_METRIC="$KUBELET_RUNTIME_OPERATIONS_METRIC >> ~/.bashrc -echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC >> ~/.bashrc +echo "export KUBELET_RUNTIME_OPERATIONS_METRIC="$KUBELET_RUNTIME_OPERATIONS_METRIC >>~/.bashrc +echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC >>~/.bashrc source ~/.bashrc -echo $NODE_NAME > /var/opt/microsoft/docker-cimprov/state/containerhostname +echo $NODE_NAME >/var/opt/microsoft/docker-cimprov/state/containerhostname #check if file was written successfully. 
cat /var/opt/microsoft/docker-cimprov/state/containerhostname @@ -514,87 +508,120 @@ dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' DOCKER_CIMPROV_VERSION=$(dpkg -l | grep docker-cimprov | awk '{print $3}') echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION -echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc +echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >>~/.bashrc #skip imds lookup since not used either legacy or aad msi auth path export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" -echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc +echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >>~/.bashrc # this used by mdsd to determine cloud specific LA endpoints export OMS_TLD=$domain -echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc +echo "export OMS_TLD=$OMS_TLD" >>~/.bashrc cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc + echo $line >>~/.bashrc done source /etc/mdsd.d/envmdsd MDSD_AAD_MSI_AUTH_ARGS="" # check if its AAD Auth MSI mode via USING_AAD_MSI_AUTH export AAD_MSI_AUTH_MODE=false if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then - echo "*** activating oneagent in aad auth msi mode ***" - # msi auth specific args - MDSD_AAD_MSI_AUTH_ARGS="-a -A" - export AAD_MSI_AUTH_MODE=true - echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc - # this used by mdsd to determine the cloud specific AMCS endpoints - export customEnvironment=$CLOUD_ENVIRONMENT - echo "export customEnvironment=$customEnvironment" >> ~/.bashrc - export MDSD_FLUENT_SOCKET_PORT="28230" - echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc - export ENABLE_MCS="true" - echo "export ENABLE_MCS=$ENABLE_MCS" >> ~/.bashrc - export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" - echo "export MONITORING_USE_GENEVA_CONFIG_SERVICE=$MONITORING_USE_GENEVA_CONFIG_SERVICE" >> ~/.bashrc - export 
MDSD_USE_LOCAL_PERSISTENCY="false" - echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >> ~/.bashrc + echo "*** activating oneagent in aad auth msi mode ***" + # msi auth specific args + MDSD_AAD_MSI_AUTH_ARGS="-a -A" + export AAD_MSI_AUTH_MODE=true + echo "export AAD_MSI_AUTH_MODE=true" >>~/.bashrc + # this used by mdsd to determine the cloud specific AMCS endpoints + export customEnvironment=$CLOUD_ENVIRONMENT + echo "export customEnvironment=$customEnvironment" >>~/.bashrc + export MDSD_FLUENT_SOCKET_PORT="28230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >>~/.bashrc + export ENABLE_MCS="true" + echo "export ENABLE_MCS=$ENABLE_MCS" >>~/.bashrc + export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" + echo "export MONITORING_USE_GENEVA_CONFIG_SERVICE=$MONITORING_USE_GENEVA_CONFIG_SERVICE" >>~/.bashrc + export MDSD_USE_LOCAL_PERSISTENCY="false" + echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >>~/.bashrc else - echo "*** activating oneagent in legacy auth mode ***" - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - #use the file path as its secure than env - CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" - echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" - export CIWORKSPACE_id=$CIWORKSPACE_id - echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc - export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile - echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc - export MDSD_FLUENT_SOCKET_PORT="29230" - echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc + echo "*** activating oneagent in legacy auth mode ***" + CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" + #use the file path as its secure than env + CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" + echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" + export CIWORKSPACE_id=$CIWORKSPACE_id + echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >>~/.bashrc + export 
CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile + echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >>~/.bashrc + export MDSD_FLUENT_SOCKET_PORT="29230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >>~/.bashrc fi source ~/.bashrc dpkg -l | grep mdsd | awk '{print $2 " " $3}' if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in sidecar container..." - #use tenant name to avoid unix socket conflict and different ports for port conflict - #roleprefix to use container specific mdsd socket - export TENANT_NAME="${CONTAINER_TYPE}" - echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc - export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default - echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc - source ~/.bashrc - mkdir /var/run/mdsd-${CONTAINER_TYPE} - # add -T 0xFFFF for full traces - mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in sidecar container..." + #use tenant name to avoid unix socket conflict and different ports for port conflict + #roleprefix to use container specific mdsd socket + export TENANT_NAME="${CONTAINER_TYPE}" + echo "export TENANT_NAME=$TENANT_NAME" >>~/.bashrc + export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default + echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >>~/.bashrc + source ~/.bashrc + mkdir /var/run/mdsd-${CONTAINER_TYPE} + # add -T 0xFFFF for full traces + mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & else - echo "starting mdsd mode in main container..." 
- # add -T 0xFFFF for full traces - mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>> /dev/null & + echo "starting mdsd mode in main container..." + # add -T 0xFFFF for full traces + mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>>/dev/null & fi # Set up a cron job for logrotation if [ ! -f /etc/cron.d/ci-agent ]; then - echo "setting up cronjob for ci agent log rotation" - echo "*/5 * * * * root /usr/sbin/logrotate -s /var/lib/logrotate/ci-agent-status /etc/logrotate.d/ci-agent >/dev/null 2>&1" > /etc/cron.d/ci-agent + echo "setting up cronjob for ci agent log rotation" + echo "*/5 * * * * root /usr/sbin/logrotate -s /var/lib/logrotate/ci-agent-status /etc/logrotate.d/ci-agent >/dev/null 2>&1" >/etc/cron.d/ci-agent fi # no dependency on fluentd for prometheus side car container if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then if [ ! 
-e "/etc/config/kube.conf" ]; then - echo "*** starting fluentd v1 in daemonset" - fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & + echo "*** starting fluentd v1 in daemonset" + fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else - echo "*** starting fluentd v1 in replicaset" - fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & + case $NUM_OF_FLUENTD_WORKERS in + 3) + export NUM_OF_FLUENTD_WORKERS=3 + export FLUENTD_POD_INVENTORY_WORKER_ID=2 + export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + 2) + export NUM_OF_FLUENTD_WORKERS=2 + export FLUENTD_POD_INVENTORY_WORKER_ID=1 + export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + + *) + export NUM_OF_FLUENTD_WORKERS=1 + export FLUENTD_POD_INVENTORY_WORKER_ID=0 + export FLUENTD_NODE_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + esac + echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc + echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc + source ~/.bashrc + + echo "*** fluentd worker configuration ***" + echo "num of workers:${NUM_OF_FLUENTD_WORKERS}" + echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" + echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}" + echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" + + echo "*** starting fluentd v1 in replicaset" + fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log 
--log-rotate-age 5 --log-rotate-size 20971520 & fi fi @@ -621,13 +648,13 @@ if [ ! -e "/etc/config/kube.conf" ]; then fi else if [ -e "/opt/telegraf-test-rs.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test-rs.conf --input-filter file -test - if [ $? -eq 0 ]; then - mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" - echo "Moving test conf file to telegraf replicaset conf since test run succeeded" - fi - echo "****************End Telegraf Run in Test Mode**************************" + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-rs.conf --input-filter file -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + echo "Moving test conf file to telegraf replicaset conf since test run succeeded" + fi + echo "****************End Telegraf Run in Test Mode**************************" fi fi @@ -671,15 +698,15 @@ else fi export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id -echo "export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id" >> ~/.bashrc +echo "export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id" >>~/.bashrc export TELEMETRY_AKS_REGION=$telemetry_aks_region -echo "export TELEMETRY_AKS_REGION=$telemetry_aks_region" >> ~/.bashrc +echo "export TELEMETRY_AKS_REGION=$telemetry_aks_region" >>~/.bashrc export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name -echo "export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name" >> ~/.bashrc +echo "export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name" >>~/.bashrc export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name -echo "export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name" >> ~/.bashrc +echo "export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name" >>~/.bashrc export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type -echo "export 
TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type" >> ~/.bashrc +echo "export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type" >>~/.bashrc #if [ ! -e "/etc/config/kube.conf" ]; then # nodename=$(cat /hostfs/etc/hostname) @@ -691,15 +718,15 @@ echo "replacing nodename in telegraf config" sed -i -e "s/placeholder_hostname/$nodename/g" $telegrafConfFile export HOST_MOUNT_PREFIX=/hostfs -echo "export HOST_MOUNT_PREFIX=/hostfs" >> ~/.bashrc +echo "export HOST_MOUNT_PREFIX=/hostfs" >>~/.bashrc export HOST_PROC=/hostfs/proc -echo "export HOST_PROC=/hostfs/proc" >> ~/.bashrc +echo "export HOST_PROC=/hostfs/proc" >>~/.bashrc export HOST_SYS=/hostfs/sys -echo "export HOST_SYS=/hostfs/sys" >> ~/.bashrc +echo "export HOST_SYS=/hostfs/sys" >>~/.bashrc export HOST_ETC=/hostfs/etc -echo "export HOST_ETC=/hostfs/etc" >> ~/.bashrc +echo "export HOST_ETC=/hostfs/etc" >>~/.bashrc export HOST_VAR=/hostfs/var -echo "export HOST_VAR=/hostfs/var" >> ~/.bashrc +echo "export HOST_VAR=/hostfs/var" >>~/.bashrc if [ ! 
-e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then @@ -735,9 +762,10 @@ service rsyslog status checkAgentOnboardingStatus $AAD_MSI_AUTH_MODE 30 shutdown() { - pkill -f mdsd - } + pkill -f mdsd +} trap "shutdown" SIGTERM -sleep inf & wait +sleep inf & +wait diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 2ff9c5249..9f9082dd2 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -607,12 +607,14 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 2 + cpu: 3 memory: 1Gi requests: cpu: 150m memory: 250Mi env: + - name: NUM_OF_FLUENTD_WORKERS + value: "3" # This value should be same as number of CPU cores specified under limits - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION From be3436ed9813019400f190ee6ed6f02d250df5db Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 4 Jan 2022 19:11:28 -0800 Subject: [PATCH 04/65] working --- build/linux/installer/conf/kube.conf | 2 +- source/plugins/ruby/in_kube_nodes.rb | 130 ++++++++++---------- source/plugins/ruby/in_kube_podinventory.rb | 1 - 3 files changed, 66 insertions(+), 67 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index ac9735e20..10a271d99 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -414,4 +414,4 @@ retry_mdm_post_wait_minutes 30 - \ No newline at end of file + diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 707cfbf9d..332066783 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -302,80 +302,80 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) # Adding telemetry to send node telemetry every 10 minutes timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - #if (timeDifferenceInMinutes >= 
@TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - begin - properties = getNodeTelemetryProps(item) - properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] - capacityInfo = item["status"]["capacity"] - - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) begin - if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) - properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] - end + properties = getNodeTelemetryProps(item) + properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] + capacityInfo = item["status"]["capacity"] + + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + begin + if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) + properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] + end - if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) - properties["amdgpus"] = capacityInfo["amd.com/gpu"] + if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) 
+ properties["amdgpus"] = capacityInfo["amd.com/gpu"] + end + rescue => errorStr + $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - rescue => errorStr - $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - # Telemetry for data collection config for replicaset - if (File.file?(@@configMapMountPath)) - properties["collectAllKubeEvents"] = @@collectAllKubeEvents - end + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents + end - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength - properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength - properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength - end - # telemetry about osm metric settings for replicaset - if (File.file?(@@osmConfigMountPath)) - properties["osmNamespaceCount"] = @@osmNamespaceCount - end - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = 
@@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength + properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength + properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength + end + # telemetry about osm metric settings for replicaset + if (File.file?(@@osmConfigMountPath)) + properties["osmNamespaceCount"] = @@osmNamespaceCount + end + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + telemetrySent = true - # Telemetry for data collection config for replicaset - if (File.file?(@@configMapMountPath)) - properties["collectAllKubeEvents"] = @@collectAllKubeEvents - end + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents + end - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength - properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength - properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength - end - # telemetry about osm metric settings for replicaset - if (File.file?(@@osmConfigMountPath)) - properties["osmNamespaceCount"] = @@osmNamespaceCount + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = 
@@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = @@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength + properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength + properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength + end + # telemetry about osm metric settings for replicaset + if (File.file?(@@osmConfigMountPath)) + properties["osmNamespaceCount"] = @@osmNamespaceCount + end + @applicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + telemetrySent = true + rescue => errorStr + $log.warn "Failed in getting telemetry in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + @applicationInsightsUtility.sendExceptionTelemetry(errorStr) end - @applicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true - rescue => errorStr - $log.warn "Failed in getting telemetry in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - @applicationInsightsUtility.sendExceptionTelemetry(errorStr) end - #end end if telemetrySent == true @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 1ed91d9cf..0ae02eea7 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -184,7 +184,6 @@ def enumerate(podList = nil) end # Flush AppInsights telemetry once all the processing is done - telemetryFlush = true if telemetryFlush == true telemetryProperties = {} telemetryProperties["Computer"] = @@hostName From c8ca6e56281bcb865817c760ebd4025ab1d66905 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem 
Date: Tue, 4 Jan 2022 23:14:20 -0800 Subject: [PATCH 05/65] fix log lines --- source/plugins/ruby/in_kube_nodes.rb | 6 +++--- source/plugins/ruby/in_kube_podinventory.rb | 14 ++++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 332066783..4708aed64 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -573,14 +573,14 @@ def watch_nodes @nodeItemsCache.clear() } continuationToken = nil - $log.info("in_kube_nodes::watch_nodes : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_nodes::watch_nodes : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") if (!nodeInventory.nil? && !nodeInventory.empty?) nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
- $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["uid"] nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 0ae02eea7..0461ad211 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -675,19 +675,20 @@ def watch_pods winNodes = KubernetesApiClient.getWindowsNodes() isCheckedWindowsNodes = true end + $log.info("in_kube_podinventory::watch_pods:number of windows nodes: #{winNodes.length} @ #{Time.now.utc.iso8601}") if podsResourceVersion.nil? # clear cache before filling the cache with list @podCacheMutex.synchronize { @podItemsCache.clear() } continuationToken = nil - $log.info("in_kube_podinventory::watch_pods : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") - $log.info("in_kube_podinventory::watch_pods : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:Done getting pods from Kube API @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty?) podsResourceVersion = podInventory["metadata"]["resourceVersion"] if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
- $log.info("in_kube_podinventory::watch_pods : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) @@ -704,7 +705,7 @@ def watch_pods if (!podInventory.nil? && !podInventory.empty?) podsResourceVersion = podInventory["metadata"]["resourceVersion"] if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_pods : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) @@ -732,9 +733,9 @@ def watch_pods !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
podsResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_pods: received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_podinventory::watch_pods: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") podsResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! break @@ -759,6 +760,7 @@ def watch_pods $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + $log.info("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end rescue Net::ReadTimeout => errorStr $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") From 6bee9547a0f7c365fb4735b073060fed74ee006b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 5 Jan 2022 21:55:57 -0800 Subject: [PATCH 06/65] refactor code --- kubernetes/omsagent.yaml | 2 +- source/plugins/ruby/KubernetesApiClient.rb | 154 ++++++------ source/plugins/ruby/WatchStream.rb | 25 +- source/plugins/ruby/in_kube_nodes.rb | 131 ++++++---- source/plugins/ruby/in_kube_podinventory.rb | 256 ++++++++++++-------- 5 files changed, 339 insertions(+), 229 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 9f9082dd2..95f9cf636 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -608,7 +608,7 @@ spec: 
resources: limits: cpu: 3 - memory: 1Gi + memory: 1.5Gi requests: cpu: 150m memory: 250Mi diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 40f80886a..dedf3c653 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -867,7 +867,7 @@ def watch(resource_name, options = {}) watcher.finish if watcher end rescue => errorStr - @Log.warn "KubernetesApiClient::watch:Failed with an error : #{errorStr}" + @Log.warn "KubernetesApiClient::watch:Failed with an error: #{errorStr}" end end @@ -890,25 +890,29 @@ def getOptimizedItem(resource, resourceItem, winNodes = []) def getServiceOptimizedItem(resourceItem) item = {} - item["metadata"] = {} - if !resourceItem["metadata"].nil? - item["metadata"]["name"] = resourceItem["metadata"]["name"] - item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] - end - item["spec"] = {} - if !resourceItem["spec"].nil? - item["spec"]["selector"] = [] - if !resourceItem["spec"]["selector"].nil? - item["spec"]["selector"] = resourceItem["spec"]["selector"] - end - item["spec"]["clusterIP"] = "" - if !resourceItem["spec"]["clusterIP"].nil? - item["spec"]["clusterIP"] = resourceItem["spec"]["clusterIP"] + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] end - item["spec"]["type"] = "" - if !resourceItem["spec"]["type"].nil? - item["spec"]["type"] = resourceItem["spec"]["type"] + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["selector"] = [] + if !resourceItem["spec"]["selector"].nil? + item["spec"]["selector"] = resourceItem["spec"]["selector"] + end + item["spec"]["clusterIP"] = "" + if !resourceItem["spec"]["clusterIP"].nil? 
+ item["spec"]["clusterIP"] = resourceItem["spec"]["clusterIP"] + end + item["spec"]["type"] = "" + if !resourceItem["spec"]["type"].nil? + item["spec"]["type"] = resourceItem["spec"]["type"] + end end + rescue => errorStr + @Log.warn "KubernetesApiClient::getServiceOptimizedItem:Failed with an error : #{errorStr}" end return item end @@ -1102,71 +1106,79 @@ def getNodeOptimizedItem(resourceItem) def getDeploymentOptimizedItem(resourceItem) item = {} - item["metadata"] = {} - if !resourceItem["metadata"].nil? - item["metadata"]["name"] = resourceItem["metadata"]["name"] - item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] - end - item["spec"] = {} - if !resourceItem["spec"].nil? - item["spec"]["strategy"] = {} - if !resourceItem["spec"]["strategy"].nil? && !resourceItem["spec"]["strategy"].empty? && !resourceItem["spec"]["strategy"]["type"].nil? - item["spec"]["strategy"]["type"] = resourceItem["spec"]["strategy"]["type"] - end - if !resourceItem["spec"]["replicas"].nil? - item["spec"]["replicas"] = resourceItem["spec"]["replicas"] - end - end - item["status"] = {} - if !resourceItem["status"].nil? - if !resourceItem["status"]["readyReplicas"].nil? - item["status"]["readyReplicas"] = resourceItem["status"]["readyReplicas"] + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] end - if !resourceItem["status"]["updatedReplicas"].nil? - item["status"]["updatedReplicas"] = resourceItem["status"]["updatedReplicas"] + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["strategy"] = {} + if !resourceItem["spec"]["strategy"].nil? && !resourceItem["spec"]["strategy"].empty? && !resourceItem["spec"]["strategy"]["type"].nil? + item["spec"]["strategy"]["type"] = resourceItem["spec"]["strategy"]["type"] + end + if !resourceItem["spec"]["replicas"].nil? 
+ item["spec"]["replicas"] = resourceItem["spec"]["replicas"] + end end - if !resourceItem["status"]["availableReplicas"].nil? - item["status"]["availableReplicas"] = resourceItem["status"]["availableReplicas"] + item["status"] = {} + if !resourceItem["status"].nil? + if !resourceItem["status"]["readyReplicas"].nil? + item["status"]["readyReplicas"] = resourceItem["status"]["readyReplicas"] + end + if !resourceItem["status"]["updatedReplicas"].nil? + item["status"]["updatedReplicas"] = resourceItem["status"]["updatedReplicas"] + end + if !resourceItem["status"]["availableReplicas"].nil? + item["status"]["availableReplicas"] = resourceItem["status"]["availableReplicas"] + end end + rescue => errorStr + @Log.warn "KubernetesApiClient::getDeploymentOptimizedItem:Failed with an error : #{errorStr}" end return item end def getHpaOptimizedItem(resourceItem) item = {} - item["metadata"] = {} - if !resourceItem["metadata"].nil? - item["metadata"]["name"] = resourceItem["metadata"]["name"] - item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] - item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] - end - item["spec"] = {} - if !resourceItem["spec"].nil? - if !resourceItem["spec"]["minReplicas"].nil? - item["spec"]["minReplicas"] = resourceItem["spec"]["minReplicas"] - end - if !resourceItem["spec"]["maxReplicas"].nil? - item["spec"]["maxReplicas"] = resourceItem["spec"]["maxReplicas"] - end - item["spec"]["scaleTargetRef"] = {} - if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["kind"].nil? - item["spec"]["scaleTargetRef"]["kind"] = resourceItem["spec"]["scaleTargetRef"]["kind"] - end - if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["name"].nil? - item["spec"]["scaleTargetRef"]["name"] = resourceItem["spec"]["scaleTargetRef"]["name"] - end - end - item["status"] = {} - if !resourceItem["status"].nil? 
- if !resourceItem["status"]["currentReplicas"].nil? - item["status"]["currentReplicas"] = resourceItem["status"]["currentReplicas"] + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] end - if !resourceItem["status"]["desiredReplicas"].nil? - item["status"]["desiredReplicas"] = resourceItem["status"]["desiredReplicas"] + item["spec"] = {} + if !resourceItem["spec"].nil? + if !resourceItem["spec"]["minReplicas"].nil? + item["spec"]["minReplicas"] = resourceItem["spec"]["minReplicas"] + end + if !resourceItem["spec"]["maxReplicas"].nil? + item["spec"]["maxReplicas"] = resourceItem["spec"]["maxReplicas"] + end + item["spec"]["scaleTargetRef"] = {} + if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["kind"].nil? + item["spec"]["scaleTargetRef"]["kind"] = resourceItem["spec"]["scaleTargetRef"]["kind"] + end + if !resourceItem["spec"]["scaleTargetRef"].nil? && !resourceItem["spec"]["scaleTargetRef"]["name"].nil? + item["spec"]["scaleTargetRef"]["name"] = resourceItem["spec"]["scaleTargetRef"]["name"] + end end - if !resourceItem["status"]["lastScaleTime"].nil? - item["status"]["lastScaleTime"] = resourceItem["status"]["lastScaleTime"] + item["status"] = {} + if !resourceItem["status"].nil? + if !resourceItem["status"]["currentReplicas"].nil? + item["status"]["currentReplicas"] = resourceItem["status"]["currentReplicas"] + end + if !resourceItem["status"]["desiredReplicas"].nil? + item["status"]["desiredReplicas"] = resourceItem["status"]["desiredReplicas"] + end + if !resourceItem["status"]["lastScaleTime"].nil? 
+ item["status"]["lastScaleTime"] = resourceItem["status"]["lastScaleTime"] + end end + rescue => errorStr + @Log.warn "KubernetesApiClient::getHpaOptimizedItem:Failed with an error : #{errorStr}" end return item end diff --git a/source/plugins/ruby/WatchStream.rb b/source/plugins/ruby/WatchStream.rb index 6633d26d5..6cc850450 100644 --- a/source/plugins/ruby/WatchStream.rb +++ b/source/plugins/ruby/WatchStream.rb @@ -23,22 +23,29 @@ def initialize(uri, http_options, http_headers, logger) @http_options = http_options @http_headers = http_headers @logger = logger - @logger.info "WatchStream:initialize @ #{Time.now.utc.iso8601}" + @path = "" + @logger.info "WatchStream::initialize @ #{Time.now.utc.iso8601}" end def each @finished = false buffer = +"" - @logger.info "WatchStream: Opening TCP session @ #{Time.now.utc.iso8601}" + @logger.info "WatchStream::each:Opening TCP session @ #{Time.now.utc.iso8601}" @http_client = Net::HTTP.start(@uri.host, @uri.port, @http_options) - path = @uri.path + if @http_client.nil? + raise "WatchStream::each:Failed to create HTTPClient object @ #{Time.now.utc.iso8601}" + end + @path = @uri.path + if @path.nil? || @path.empty? + raise "WatchStream::each:URI path should not be empty or nil @ #{Time.now.utc.iso8601}" + end if !@uri.query.nil? && !@uri.query.empty? - path += "?" + @uri.query + @path += "?" + @uri.query end - @logger.info "WatchStream: Making GET API call for Watch with path: #{path} @ #{Time.now.utc.iso8601}" - @http_client.request_get(path, @http_headers) do |response| + @logger.info "WatchStream::each:Making GET API call for Watch with path: #{@path} @ #{Time.now.utc.iso8601}" + @http_client.request_get(@path, @http_headers) do |response| if !response.nil? 
&& response.code.to_i > 300 - raise "WatchStream: watch connection failed with an http status code: #{response.code}" + raise "WatchStream::each:Watch connection of the path: #{@path} failed with an http status code: #{response.code} @ #{Time.now.utc.iso8601}" end response.read_body do |chunk| buffer << chunk @@ -54,10 +61,10 @@ def each def finish begin @finished = true - @logger.info "WatchStream:finish HTTP session @ #{Time.now.utc.iso8601}" + @logger.info "WatchStream::finish:Closing HTTP session of the path:#{@path} @ #{Time.now.utc.iso8601}" @http_client.finish if !@http_client.nil? && @http_client.started? rescue => error - @logger.warn "WatchStream:finish failed with an error: #{error} @ #{Time.now.utc.iso8601}" + @logger.warn "WatchStream::finish:Closing of HTTP session of the path: #{@path} failed with an error: #{error} @ #{Time.now.utc.iso8601}" end end end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 4708aed64..3c49ebabf 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -564,6 +564,7 @@ def getNodeTelemetryProps(item) end def watch_nodes + $log.info("in_kube_nodes::watch_nodes:Start @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil loop do begin @@ -583,10 +584,18 @@ def watch_nodes $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["uid"] - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem - } + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? 
+ @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + end end end else @@ -600,10 +609,18 @@ def watch_nodes $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["uid"] - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem - } + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? + @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + end end end else @@ -611,58 +628,74 @@ def watch_nodes end end end - $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
- nodesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + begin + $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? 
+ @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + @nodeCacheMutex.synchronize { + @nodeItemsCache.delete(key) + } + end + end + when "ERROR" nodesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") break + else + $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem - } - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - @nodeCacheMutex.synchronize { - @nodeItemsCache.delete(key) - } - end - when "ERROR" - nodesResourceVersion = nil - $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + rescue Net::ReadTimeout => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ 
#{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end end + $log.info("in_kube_nodes::watch_nodes:End @ #{Time.now.utc.iso8601}") end end # Kube_Node_Input diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 0461ad211..dde92236d 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -665,6 +665,7 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) end def watch_pods + $log.info("in_kube_podinventory::watch_pods:Start @ #{Time.now.utc.iso8601}") podsResourceVersion = nil isCheckedWindowsNodes = false loop do @@ -691,10 +692,18 @@ def watch_pods $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) - @podCacheMutex.synchronize { - @podItemsCache[key] = podItem - } + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + if !podItem.nil? && !podItem.empty? 
+ @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + end end end else @@ -708,10 +717,18 @@ def watch_pods $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) - @podCacheMutex.synchronize { - @podItemsCache[key] = podItem - } + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + end end end else @@ -719,62 +736,80 @@ def watch_pods end end end - $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
- podsResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + begin + $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + podsResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? 
+ podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + @podCacheMutex.synchronize { + @podItemsCache.delete(key) + } + end + end + when "ERROR" podsResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") break + else + $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) - @podCacheMutex.synchronize { - @podItemsCache[key] = podItem - } - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - @podCacheMutex.synchronize { - @podItemsCache.delete(key) - } - end - when "ERROR" - podsResourceVersion = nil - $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end + $log.info("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end - $log.info("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + rescue Net::ReadTimeout => errorStr + ## This 
expected if there is no activity more than readtimeout value used in the connection + $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") podsResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end end + $log.info("in_kube_podinventory::watch_pods:End @ #{Time.now.utc.iso8601}") end def watch_services + $log.info("in_kube_podinventory::watch_services:Start @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil loop do begin @@ -783,9 +818,9 @@ def watch_services @serviceCacheMutex.synchronize { @serviceItemsCache.clear() } - $log.info("in_kube_podinventory::watch_services : Getting services from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_services:Getting services from Kube API @ #{Time.now.utc.iso8601}") serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") - $log.info("in_kube_podinventory::watch_services : Done getting services from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_services: Done getting services from Kube API @ #{Time.now.utc.iso8601}") if !serviceInfo.nil? $log.info("in_kube_podinventory::watch_services:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") serviceInventory = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) @@ -794,13 +829,21 @@ def watch_services if (!serviceInventory.nil? 
&& !serviceInventory.empty?) servicesResourceVersion = serviceInventory["metadata"]["resourceVersion"] if (serviceInventory.key?("items") && !serviceInventory["items"].nil? && !serviceInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_services : number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_services:number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}") serviceInventory["items"].each do |item| key = item["metadata"]["uid"] - serviceItem = KubernetesApiClient.getOptimizedItem("services", item) - @serviceCacheMutex.synchronize { - @serviceItemsCache[key] = serviceItem - } + if !key.nil? && !key.empty? + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + if !serviceItem.nil? && !serviceItem.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + else + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty" + end + else + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty" + end end end else @@ -809,59 +852,74 @@ def watch_services serviceInventory = nil end end - - $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? 
&& !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - servicesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + begin + $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + servicesResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
+ break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + if !serviceItem.nil? && !serviceItem.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + else + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty" + end + else + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache.delete(key) + } + end + end + when "ERROR" servicesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") break + else + $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - serviceItem = KubernetesApiClient.getOptimizedItem("services", item) - @serviceCacheMutex.synchronize { - @serviceItemsCache[key] = serviceItem - } - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - @serviceCacheMutex.synchronize { - @serviceItemsCache.delete(key) - } - end - when "ERROR" - servicesResourceVersion = nil - $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + rescue Net::ReadTimeout => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ 
#{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end end + $log.info("in_kube_podinventory::watch_services:End @ #{Time.now.utc.iso8601}") end end # Kube_Pod_Input end # module From 0593d020bf989f32a607d91c3284e95aded9a56b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 7 Jan 2022 10:50:34 -0800 Subject: [PATCH 07/65] cache telemetry --- kubernetes/omsagent.yaml | 2 ++ source/plugins/ruby/KubernetesApiClient.rb | 8 ++++++++ source/plugins/ruby/in_kube_nodes.rb | 15 ++++++++++----- source/plugins/ruby/in_kube_podinventory.rb | 15 ++++++++++++--- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 95f9cf636..d5545f041 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -392,6 +392,8 @@ spec: # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests - name: ISTEST value: "true" + - name: EMIT_CACHE_TELEMETRY + value: "false" #Uncomment below two lines for ACS clusters and set the cluster names manually. 
Also comment out the above two lines for ACS clusters #- name: ACS_RESOURCE_NAME # value: "my_acs_cluster_name" diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index dedf3c653..003dab9cf 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -1182,5 +1182,13 @@ def getHpaOptimizedItem(resourceItem) end return item end + + def isEmitCacheTelemetry + isEmitCacheTelemtryEnabled = false + if !ENV["EMIT_CACHE_TELEMETRY"].nil? && !ENV["EMIT_CACHE_TELEMETRY"].empty? && ENV["EMIT_CACHE_TELEMETRY"].downcase == "true" + isEmitCacheTelemtryEnabled = true + end + return isEmitCacheTelemtryEnabled + end end end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 3c49ebabf..2d3417622 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -65,7 +65,6 @@ def initialize(kubernetesApiClient = nil, @NodeCache = NodeStatsCache.new() @watchNodesThread = nil @nodeItemsCache = {} - #@nodeItemsCacheSizeKB = 0 end config_param :run_interval, :time, :default => 60 @@ -153,9 +152,12 @@ def enumerate # Initializing continuation token to nil continuationToken = nil nodeInventory = {} + nodeItemsCacheSizeKB = 0 @nodeCacheMutex.synchronize { nodeInventory["items"] = @nodeItemsCache.values.clone - #@nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024 + if KubernetesApiClient.isEmitCacheTelemetry() + nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024 + end } nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) @@ -169,8 +171,12 @@ def enumerate timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - 
@applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) - @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) + telemetryProperties = {} + if KubernetesApiClient.isEmitCacheTelemetry() + telemetryProperties["NODE_ITEMS_CACHE_SIZE_KB"] = nodeItemsCacheSizeKB + end + @applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, telemetryProperties) + @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, telemetryProperties) @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end # Setting this to nil so that we dont hold memory until GC kicks in @@ -556,7 +562,6 @@ def getNodeTelemetryProps(item) end properties["NODES_CHUNK_SIZE"] = @NODES_CHUNK_SIZE properties["NODES_EMIT_STREAM_BATCH_SIZE"] = @NODES_EMIT_STREAM_BATCH_SIZE - #properties["NODE_ITEMS_CACHE_SIZE_KB"] = @nodeItemsCacheSizeKB rescue => errorStr $log.warn "in_kube_nodes::getContainerNodeIngetNodeTelemetryPropsventoryRecord:Failed: #{errorStr}" end diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index dde92236d..7ed5e29cf 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -144,8 +144,12 @@ def enumerate(podList = nil) end serviceInventory = {} + serviceItemsCacheSizeKB = 0 @serviceCacheMutex.synchronize { serviceInventory["items"] = @serviceItemsCache.values.clone + if KubernetesApiClient.isEmitCacheTelemetry() + serviceItemsCacheSizeKB = @serviceItemsCache.to_s.length / 1024 + end } serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceInventory, batchTime) # updating for telemetry @@ -157,11 +161,13 @@ def enumerate(podList = nil) podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil 
continuationToken = nil - #podItemsCacheSizeKB = 0 + podItemsCacheSizeKB = 0 podInventory = {} @podCacheMutex.synchronize { podInventory["items"] = @podItemsCache.values.clone - #podItemsCacheSizeKB = @podItemsCache.to_s.length / 1024 + if KubernetesApiClient.isEmitCacheTelemetry() + podItemsCacheSizeKB = @podItemsCache.to_s.length / 1024 + end } podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) @@ -189,7 +195,10 @@ def enumerate(podList = nil) telemetryProperties["Computer"] = @@hostName telemetryProperties["PODS_CHUNK_SIZE"] = @PODS_CHUNK_SIZE telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE - #telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB + if KubernetesApiClient.isEmitCacheTelemetry() + telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB + telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB + end ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) ApplicationInsightsUtility.sendMetricTelemetry("ServiceCount", @serviceCount, {}) From 3f11a273f9473baa97731c7176a248ba129ae134 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 7 Jan 2022 18:06:21 -0800 Subject: [PATCH 08/65] nodecount telemetry --- source/plugins/ruby/in_kube_nodes.rb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 2d3417622..997167780 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -153,6 +153,7 @@ def enumerate continuationToken = nil nodeInventory = {} nodeItemsCacheSizeKB = 0 + nodeCount = 0 @nodeCacheMutex.synchronize { nodeInventory["items"] = @nodeItemsCache.values.clone if KubernetesApiClient.isEmitCacheTelemetry() @@ -162,7 +163,8 @@ def 
enumerate nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeCount = nodeInventory["items"].length + $log.info("in_kube_nodes::enumerate : number of node items :#{nodeCount} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" @@ -171,12 +173,13 @@ def enumerate timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) + @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) telemetryProperties = {} if KubernetesApiClient.isEmitCacheTelemetry() telemetryProperties["NODE_ITEMS_CACHE_SIZE_KB"] = nodeItemsCacheSizeKB end - @applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, telemetryProperties) - @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("NodeCount", nodeCount, telemetryProperties) @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end # Setting this to nil so that we dont hold memory until GC kicks in From 3752459a65d2c9392609dbb4ba8a30cdf8f779bb Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 8 Jan 2022 11:26:32 -0800 Subject: [PATCH 09/65] bug fix --- 
source/plugins/ruby/KubernetesApiClient.rb | 27 ++++++++++++--------- source/plugins/ruby/in_kube_podinventory.rb | 16 ++++-------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 003dab9cf..348d2e7ba 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -871,10 +871,10 @@ def watch(resource_name, options = {}) end end - def getOptimizedItem(resource, resourceItem, winNodes = []) + def getOptimizedItem(resource, resourceItem) case resource when "pods" - return getPodOptimizedItem(resourceItem, winNodes) + return getPodOptimizedItem(resourceItem) when "nodes" return getNodeOptimizedItem(resourceItem) when "services" @@ -917,18 +917,23 @@ def getServiceOptimizedItem(resourceItem) return item end - def isWindowsPodItem(podItem, winNodes) + def isWindowsPodItem(podItem) isWindowsPod = false - if !winNodes.nil? && !winNodes.empty? - nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? podItem["spec"]["nodeName"] : "" - if !nodeName.empty? && winNodes.include?(nodeName) - isWindowsPod = true + begin + winNodes = KubernetesApiClient.getWindowsNodesArray() + if !winNodes.nil? && !winNodes.empty? && winNodes.length > 0 + nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? podItem["spec"]["nodeName"] : "" + if !nodeName.empty? 
&& winNodes.include?(nodeName) + isWindowsPod = true + end end + rescue => errorStr + $Log.warn "KubernetesApiClient::::isWindowsPodItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}" end return isWindowsPod end - def getPodOptimizedItem(resourceItem, winNodes) + def getPodOptimizedItem(resourceItem) item = {} begin item["metadata"] = {} @@ -951,7 +956,7 @@ def getPodOptimizedItem(resourceItem, winNodes) item["metadata"]["deletionTimestamp"] = resourceItem["metadata"]["deletionTimestamp"] end end - isWindowsPod = isWindowsPodItem(resourceItem, winNodes) + isWindowsPod = isWindowsPodItem(resourceItem) item["spec"] = {} if !resourceItem["spec"].nil? item["spec"]["containers"] = [] @@ -970,7 +975,7 @@ def getPodOptimizedItem(resourceItem, winNodes) currentContainer["image"] = container["image"] currentContainer["ports"] = container["ports"] currentContainer["command"] = container["command"] - currentContainer["EnvironmentVar"] = "" + currentContainer["env"] = "" if !isDisableClusterCollectEnvVar currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) end @@ -989,7 +994,7 @@ def getPodOptimizedItem(resourceItem, winNodes) currentContainer["image"] = container["image"] currentContainer["ports"] = container["ports"] currentContainer["command"] = container["command"] - currentContainer["EnvironmentVar"] = "" + currentContainer["env"] = "" if !isDisableClusterCollectEnvVar currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) end diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 7ed5e29cf..4103fcd33 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -676,16 +676,10 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) def watch_pods $log.info("in_kube_podinventory::watch_pods:Start @ 
#{Time.now.utc.iso8601}") podsResourceVersion = nil - isCheckedWindowsNodes = false + # invoke getWindowsNodes to get windowsnodearray cache populated + KubernetesApiClient.getWindowsNodes() loop do begin - # check if the cluster has windows nodes since windows container records requires inventory specific fields - winNodes = KubernetesApiClient.getWindowsNodesArray() - if !isCheckedWindowsNodes && winNodes.empty? - winNodes = KubernetesApiClient.getWindowsNodes() - isCheckedWindowsNodes = true - end - $log.info("in_kube_podinventory::watch_pods:number of windows nodes: #{winNodes.length} @ #{Time.now.utc.iso8601}") if podsResourceVersion.nil? # clear cache before filling the cache with list @podCacheMutex.synchronize { @@ -702,7 +696,7 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + podItem = KubernetesApiClient.getOptimizedItem("pods", item) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -727,7 +721,7 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + podItem = KubernetesApiClient.getOptimizedItem("pods", item) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -770,7 +764,7 @@ def watch_pods if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item, winNodes) + podItem = KubernetesApiClient.getOptimizedItem("pods", item) if !podItem.nil? && !podItem.empty? 
@podCacheMutex.synchronize { @podItemsCache[key] = podItem From 694bbc0c5f98efd7c02a2bf37b76d84f4be0a2d1 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 8 Jan 2022 21:58:46 -0800 Subject: [PATCH 10/65] further optimize --- source/plugins/ruby/KubernetesApiClient.rb | 55 +++++++++++++++++----- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 348d2e7ba..1bfa780d9 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -939,17 +939,21 @@ def getPodOptimizedItem(resourceItem) item["metadata"] = {} if !resourceItem["metadata"].nil? if !resourceItem["metadata"]["annotations"].nil? - item["metadata"]["annotations"] = resourceItem["metadata"]["annotations"] + item["metadata"]["annotations"] = {} + item["metadata"]["annotations"]["kubernetes.io/config.hash"] = resourceItem["metadata"]["annotations"]["kubernetes.io/config.hash"] end if !resourceItem["metadata"]["labels"].nil? item["metadata"]["labels"] = resourceItem["metadata"]["labels"] end - if !resourceItem["metadata"]["ownerReferences"].nil? - item["metadata"]["ownerReferences"] = resourceItem["metadata"]["ownerReferences"] + if !resourceItem["metadata"]["ownerReferences"].nil? 
&& resourceItem["metadata"]["ownerReferences"].length > 0 + item["metadata"]["ownerReferences"] = [] + ownerReference = {} + ownerReference["name"] = resourceItem["metadata"]["ownerReferences"][0]["name"] + ownerReference["kind"] = resourceItem["metadata"]["ownerReferences"][0]["kind"] + item["metadata"]["ownerReferences"].push(ownerReference) end item["metadata"]["name"] = resourceItem["metadata"]["name"] item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] - item["metadata"]["resourceVersion"] = resourceItem["metadata"]["resourceVersion"] item["metadata"]["uid"] = resourceItem["metadata"]["uid"] item["metadata"]["creationTimestamp"] = resourceItem["metadata"]["creationTimestamp"] if !resourceItem["metadata"]["deletionTimestamp"].nil? @@ -1021,14 +1025,12 @@ def getPodOptimizedItem(resourceItem) if !resourceItem["status"]["phase"].nil? item["status"]["phase"] = resourceItem["status"]["phase"] end - item["status"]["conditions"] = [] if !resourceItem["status"]["conditions"].nil? + item["status"]["conditions"] = [] resourceItem["status"]["conditions"].each do |condition| currentCondition = {} currentCondition["type"] = condition["type"] currentCondition["status"] = condition["status"] - ## TODO - check if we need this - currentCondition["lastTransitionTime"] = condition["lastTransitionTime"] item["status"]["conditions"].push(currentCondition) end end @@ -1083,12 +1085,25 @@ def getNodeOptimizedItem(resourceItem) item["spec"] = {} if !resourceItem["spec"].nil? if !resourceItem["spec"]["providerID"].nil? && !resourceItem["spec"]["providerID"].empty? - item["spec"]["providerID"] = resourceItem["spec"]["providerID"] + provider = resourceItem["spec"]["providerID"].split(":")[0] + if !provider.nil? && !provider.empty? + item["spec"]["providerID"] = provider + end end end item["status"] = {} if !resourceItem["status"].nil? 
- item["status"]["conditions"] = resourceItem["status"]["conditions"] + item["status"]["conditions"] = [] + if !resourceItem["status"]["conditions"].nil? + resourceItem["status"]["conditions"].each do |condition| + currentCondition = {} + currentCondition["type"] = condition["type"] + currentCondition["status"] = condition["status"] + currentCondition["lastTransitionTime"] = condition["lastTransitionTime"] + item["status"]["conditions"].push(currentCondition) + end + end + item["status"]["nodeInfo"] = {} nodeInfo = {} if !resourceItem["status"]["nodeInfo"].nil? && !resourceItem["status"]["nodeInfo"].empty? @@ -1100,8 +1115,26 @@ def getNodeOptimizedItem(resourceItem) nodeInfo["kernelVersion"] = resourceItem["status"]["nodeInfo"]["kernelVersion"] end item["status"]["nodeInfo"] = nodeInfo - item["status"]["allocatable"] = resourceItem["status"]["allocatable"] - item["status"]["capacity"] = resourceItem["status"]["capacity"] + + item["status"]["allocatable"] = {} + nodeAllocatable = {} + if !resourceItem["status"]["allocatable"].nil? && !resourceItem["status"]["allocatable"].empty? + nodeAllocatable["cpu"] = resourceItem["status"]["allocatable"]["cpu"] + nodeAllocatable["memory"] = resourceItem["status"]["allocatable"]["memory"] + nodeAllocatable["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"] + nodeAllocatable["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"] + end + item["status"]["allocatable"] = nodeAllocatable + + item["status"]["capacity"] = {} + nodeCapacity = {} + if !resourceItem["status"]["capacity"].nil? && !resourceItem["status"]["capacity"].empty? 
+ nodeCapacity["cpu"] = resourceItem["status"]["allocatable"]["cpu"] + nodeCapacity["memory"] = resourceItem["status"]["allocatable"]["memory"] + nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"] + nodeCapacity["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"] + end + item["status"]["capacity"] = nodeCapacity end rescue => errorStr @Log.warn "KubernetesApiClient::getNodeOptimizedItem:Failed with an error : #{errorStr}" From ac88379590f7018a48be29f7bafe61838f26c984 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 9 Jan 2022 19:35:15 -0800 Subject: [PATCH 11/65] bugfix related typo --- source/plugins/ruby/KubernetesApiClient.rb | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 1bfa780d9..594735eee 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -1104,7 +1104,6 @@ def getNodeOptimizedItem(resourceItem) end end - item["status"]["nodeInfo"] = {} nodeInfo = {} if !resourceItem["status"]["nodeInfo"].nil? && !resourceItem["status"]["nodeInfo"].empty? nodeInfo["kubeletVersion"] = resourceItem["status"]["nodeInfo"]["kubeletVersion"] @@ -1116,7 +1115,6 @@ def getNodeOptimizedItem(resourceItem) end item["status"]["nodeInfo"] = nodeInfo - item["status"]["allocatable"] = {} nodeAllocatable = {} if !resourceItem["status"]["allocatable"].nil? && !resourceItem["status"]["allocatable"].empty? nodeAllocatable["cpu"] = resourceItem["status"]["allocatable"]["cpu"] @@ -1126,13 +1124,12 @@ def getNodeOptimizedItem(resourceItem) end item["status"]["allocatable"] = nodeAllocatable - item["status"]["capacity"] = {} nodeCapacity = {} if !resourceItem["status"]["capacity"].nil? && !resourceItem["status"]["capacity"].empty? 
- nodeCapacity["cpu"] = resourceItem["status"]["allocatable"]["cpu"] - nodeCapacity["memory"] = resourceItem["status"]["allocatable"]["memory"] - nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"] - nodeCapacity["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"] + nodeCapacity["cpu"] = resourceItem["status"]["capacity"]["cpu"] + nodeCapacity["memory"] = resourceItem["status"]["capacity"]["memory"] + nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["capacity"]["nvidia.com/gpu"] + nodeCapacity["amd.com/gpu"] = resourceItem["status"]["capacity"]["amd.com/gpu"] end item["status"]["capacity"] = nodeCapacity end From 5835f4284f007a21e8202d8d1a793634c016d78f Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 10 Jan 2022 19:14:16 -0800 Subject: [PATCH 12/65] node allocatable cache --- source/plugins/ruby/KubernetesApiClient.rb | 76 ++++--- source/plugins/ruby/in_kube_nodes.rb | 10 +- source/plugins/ruby/in_kube_podinventory.rb | 207 ++++++++++++++++++-- 3 files changed, 245 insertions(+), 48 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 594735eee..194388d9f 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -37,7 +37,6 @@ class KubernetesApiClient @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@TokenStr = nil - @@NodeMetrics = Hash.new @@WinNodeArray = [] @@telemetryTimeTracker = DateTime.now.to_time.to_i @@resourceLimitsTelemetryHash = {} @@ -411,7 +410,7 @@ def getPodUid(podNameSpace, podMetadata) return podUid end - def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, 
metricNametoReturn, nodeAllocatableRecord, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId @@ -466,11 +465,8 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricItems.push(metricProps) #No container level limit for the given metric, so default to node level limit else - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) - metricValue = @@NodeMetrics[nodeMetricsHashKey] - #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") - + if (metricCategory == "limits" && !nodeAllocatableRecord.nil? && !nodeAllocatableRecord.empty? && nodeAllocatableRecord.has_key?(metricNameToCollect)) + metricValue = nodeAllocatableRecord[metricNameToCollect] metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName @@ -498,7 +494,7 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle return metricItems end #getContainerResourceRequestAndLimits - def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, nodeAllocatableRecord, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId @@ -543,8 +539,9 @@ def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, else #No container level limit for the given metric, so default to node level limit for non-gpu metrics if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - 
metricValue = @@NodeMetrics[nodeMetricsHashKey] + if !nodeAllocatableRecord.nil? && !nodeAllocatableRecord.empty? && nodeAllocatableRecord.has_key?(metricNameToCollect) + metricValue = nodeAllocatableRecord[metricNameToCollect] + end end end if (!metricValue.nil?) @@ -621,11 +618,6 @@ def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metri metricItem["json_Collections"] = [] metricItem["json_Collections"] = metricCollections.to_json - - #push node level metrics to a inmem hash so that we can use it looking up at container level. - #Currently if container level cpu & memory limits are not defined we default to node level limits - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") end rescue => error @Log.warn("parseNodeLimitsFromNodeItem failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") @@ -659,13 +651,6 @@ def parseNodeLimitsAsInsightsMetrics(node, metricCategory, metricNameToCollect, metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect metricItem["Tags"] = metricTags - - #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level. 
- #Currently if container level cpu & memory limits are not defined we default to node level limits - if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") - end end rescue => error @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") @@ -917,6 +902,22 @@ def getServiceOptimizedItem(resourceItem) return item end + def isWindowsNodeItem(nodeResourceItem) + isWindowsNodeItem = false + begin + nodeStatus = nodeResourceItem["status"] + if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? + operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + isWindowsNodeItem = true + end + end + rescue => errorStr + $Log.warn "KubernetesApiClient::::isWindowsNodeItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}" + end + return isWindowsNodeItem + end + def isWindowsPodItem(podItem) isWindowsPod = false begin @@ -1071,6 +1072,21 @@ def getPodOptimizedItem(resourceItem) return item end + def getNodeAllocatableValues(nodeResourceItem) + nodeAllocatable = {} + begin + if !nodeResourceItem["status"].nil? && + !nodeResourceItem["status"]["allocatable"].nil? && + !nodeResourceItem["status"]["allocatable"].empty? 
+ nodeAllocatable["cpu"] = nodeResourceItem["status"]["allocatable"]["cpu"] + nodeAllocatable["memory"] = nodeResourceItem["status"]["allocatable"]["memory"] + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getNodeAllocatableValues:Failed with an error : #{errorStr}" + end + return nodeAllocatable + end + def getNodeOptimizedItem(resourceItem) item = {} begin @@ -1119,8 +1135,12 @@ def getNodeOptimizedItem(resourceItem) if !resourceItem["status"]["allocatable"].nil? && !resourceItem["status"]["allocatable"].empty? nodeAllocatable["cpu"] = resourceItem["status"]["allocatable"]["cpu"] nodeAllocatable["memory"] = resourceItem["status"]["allocatable"]["memory"] - nodeAllocatable["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"] - nodeAllocatable["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"] + if !resourceItem["status"]["allocatable"]["nvidia.com/gpu"].nil? + nodeAllocatable["nvidia.com/gpu"] = resourceItem["status"]["allocatable"]["nvidia.com/gpu"] + end + if !resourceItem["status"]["allocatable"]["amd.com/gpu"].nil? + nodeAllocatable["amd.com/gpu"] = resourceItem["status"]["allocatable"]["amd.com/gpu"] + end end item["status"]["allocatable"] = nodeAllocatable @@ -1128,8 +1148,12 @@ def getNodeOptimizedItem(resourceItem) if !resourceItem["status"]["capacity"].nil? && !resourceItem["status"]["capacity"].empty? nodeCapacity["cpu"] = resourceItem["status"]["capacity"]["cpu"] nodeCapacity["memory"] = resourceItem["status"]["capacity"]["memory"] - nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["capacity"]["nvidia.com/gpu"] - nodeCapacity["amd.com/gpu"] = resourceItem["status"]["capacity"]["amd.com/gpu"] + if !resourceItem["status"]["capacity"]["nvidia.com/gpu"].nil? + nodeCapacity["nvidia.com/gpu"] = resourceItem["status"]["capacity"]["nvidia.com/gpu"] + end + if !resourceItem["status"]["capacity"]["amd.com/gpu"].nil? 
+ nodeCapacity["amd.com/gpu"] = resourceItem["status"]["capacity"]["amd.com/gpu"] + end end item["status"]["capacity"] = nodeCapacity end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 997167780..a7d9a8f6d 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -207,9 +207,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream - $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@MDMKubeNodeInventoryTag, eventStream) if eventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -222,7 +222,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryRecord) if containerNodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) @@ -271,7 +271,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) kubePerfEventStream.add(emitTime, metricRecord) if metricRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) @@ -301,7 +301,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 4103fcd33..acff8a591 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -33,6 +33,7 @@ def initialize # this configurable via configmap @PODS_CHUNK_SIZE = 0 @PODS_EMIT_STREAM_BATCH_SIZE = 0 + @NODES_CHUNK_SIZE = 0 @podCount = 0 @serviceCount = 0 @@ -47,6 +48,10 @@ def initialize @watchServicesThread = nil @serviceItemsCache = {} + @watchNodesThread = nil + @nodeAllocatableCache = {} + @windowsNodeCache = {} + @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" @@ -81,14 +86,27 @@ def start @PODS_EMIT_STREAM_BATCH_SIZE = 200 end $log.info("in_kube_podinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") + + if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? 
&& ENV["NODES_CHUNK_SIZE"].to_i > 0 + @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_podinventory::start: setting to default value since got NODES_CHUNK_SIZE nil or empty") + @NODES_CHUNK_SIZE = 250 + end + $log.info("in_kube_podinventory::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}") + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @podCacheMutex = Mutex.new @serviceCacheMutex = Mutex.new + @nodeAllocatableCacheMutex = Mutex.new + # @windowsNodeCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) @watchPodsThread = Thread.new(&method(:watch_pods)) @watchServicesThread = Thread.new(&method(:watch_services)) + @watchNodesThread = Thread.new(&method(:watch_nodes)) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -101,6 +119,8 @@ def shutdown } @thread.join @watchPodsThread.join + @watchServicesThread.join + @watchNodesThread.join super # This super must be at the end of shutdown method end end @@ -156,6 +176,15 @@ def enumerate(podList = nil) @serviceCount = serviceRecords.length $log.info("in_kube_podinventory::enumerate : number of service items :#{@serviceCount} from Kube API @ #{Time.now.utc.iso8601}") + nodeAllocatableRecords = {} + nodeAllocatableCacheSizeKB = 0 + @nodeAllocatableCacheMutex.synchronize { + nodeAllocatableRecords = @nodeAllocatableCache.clone + } + if KubernetesApiClient.isEmitCacheTelemetry() + nodeAllocatableCacheSizeKB = nodeAllocatableRecords.to_s.length / 1024 + end + $log.info("in_kube_podinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") # to track e2e processing latency @podsAPIE2ELatencyMs = 0 podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i @@ -173,7 +202,7 @@ def enumerate(podList = nil) @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? 
&& !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -181,6 +210,7 @@ def enumerate(podList = nil) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil + nodeAllocatableRecords = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -198,6 +228,7 @@ def enumerate(podList = nil) if KubernetesApiClient.isEmitCacheTelemetry() telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB + telemetryProperties["NODE_ALLOCATABLE_ITEMS_CACHE_SIZE_KB"] = nodeAllocatableCacheSizeKB end ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) @@ -219,7 +250,7 @@ def enumerate(podList = nil) end end - def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = Fluent::Engine.now #batchTime = currentTime.utc.iso8601 @@ -243,11 +274,11 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end # Setting this flag to true so that we can send ContainerInventory records for containers # on windows nodes and parse environment variables 
for these containers + nodeName = "" + if !item["spec"]["nodeName"].nil? + nodeName = item["spec"]["nodeName"] + end if winNodes.length > 0 - nodeName = "" - if !item["spec"]["nodeName"].nil? - nodeName = item["spec"]["nodeName"] - end if (!nodeName.empty? && (winNodes.include? nodeName)) clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel @@ -263,7 +294,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -271,19 +302,23 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc eventStream = Fluent::MultiEventStream.new end + nodeAllocatableRecord = {} + if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? 
&& nodeAllocatableRecords.has_key?(nodeName) + nodeAllocatableRecord = nodeAllocatableRecords[nodeName] + end #container perf records containerMetricDataItems = [] - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime)) containerMetricDataItems.each do |record| kubePerfEventStream.add(emitTime, record) if record end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") 
router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -293,16 +328,16 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc # container GPU records containerGPUInsightsMetricsDataItems = [] - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", 
nodeAllocatableRecord, batchTime)) containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -924,5 +959,143 @@ def watch_services end $log.info("in_kube_podinventory::watch_services:End @ #{Time.now.utc.iso8601}") end + + def watch_nodes + $log.info("in_kube_podinventory::watch_nodes:Start @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + loop do + begin + if nodesResourceVersion.nil? + # clear node limits cache before filling the cache with list + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache.clear() + } + # @windowsNodeCacheMutex.synchronize { + # @windowsNodeCache.clear() + # } + continuationToken = nil + $log.info("in_kube_podinventory::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) + $log.info("in_kube_podinventory::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + if (!nodeInventory.nil? && !nodeInventory.empty?) 
+ nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + end + end + end + else + $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory" + end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? 
+ @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + end + end + end + else + $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory" + end + end + end + begin + $log.info("in_kube_podinventory::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_podinventory::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["name"] + if !key.nil? && !key.empty? 
+ nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + end + else + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache.delete(key) + } + end + end + when "ERROR" + nodesResourceVersion = nil + $log.warn("in_kube_podinventory::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_podinventory::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + end + end + end + rescue Net::ReadTimeout => errorStr + $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher + end + rescue => errorStr + $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + end + end + $log.info("in_kube_podinventory::watch_nodes:End @ #{Time.now.utc.iso8601}") + end end # Kube_Pod_Input end # module From 12f9754bc98261eae3b7122032bb68866f09ca64 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 12 Jan 2022 12:22:40 -0800 Subject: [PATCH 13/65] wincontainerinventory in multiproc --- kubernetes/omsagent.yaml | 8 +- source/plugins/ruby/KubernetesApiClient.rb | 87 +++++++++------ source/plugins/ruby/in_kube_podinventory.rb | 116 ++++++++++++++++---- 3 files changed, 
152 insertions(+), 59 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index d5545f041..1a4caf7dd 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cirshs01122022" imagePullPolicy: IfNotPresent resources: limits: @@ -456,7 +456,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode # - name: omsagent-prometheus - # image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + # image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cirshs01122022" # imagePullPolicy: IfNotPresent # resources: # limits: @@ -605,7 +605,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cirshs01122022" imagePullPolicy: IfNotPresent resources: limits: @@ -615,6 +615,8 @@ spec: cpu: 150m memory: 250Mi env: + - name: EMIT_CACHE_TELEMETRY + value: "true" - name: NUM_OF_FLUENTD_WORKERS value: "3" # This value should be same as number of CPU cores specified under limits - name: AKS_RESOURCE_ID diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 194388d9f..1a7444b28 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -37,7 +37,6 @@ class KubernetesApiClient @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@TokenStr = nil - @@WinNodeArray = [] @@telemetryTimeTracker = DateTime.now.to_time.to_i @@resourceLimitsTelemetryHash = {} @@ -293,8 +292,6 @@ def getWindowsNodes resourceUri = 
getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows") nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body) @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" - # Resetting the windows node cache - @@WinNodeArray.clear if (!nodeInventory.empty?) nodeInventory["items"].each do |item| # check for windows operating system in node metadata @@ -304,11 +301,6 @@ def getWindowsNodes if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) - # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data - # to get images and image tags for containers in windows nodes - if !nodeMetadata.nil? && !nodeMetadata["name"].nil? - @@WinNodeArray.push(nodeMetadata["name"]) - end nodeStatusAddresses = nodeStatus["addresses"] if !nodeStatusAddresses.nil? nodeStatusAddresses.each do |address| @@ -328,7 +320,33 @@ def getWindowsNodes end def getWindowsNodesArray - return @@WinNodeArray + winNodeArray = [] + begin + # get only windows nodes + resourceUri = getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows") + nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body) + @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" + if (!nodeInventory.empty?) + nodeInventory["items"].each do |item| + # check for windows operating system in node metadata + nodeStatus = item["status"] + nodeMetadata = item["metadata"] + if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? 
+ operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data + # to get images and image tags for containers in windows nodes + if !nodeMetadata.nil? && !nodeMetadata["name"].nil? + winNodeArray.push(nodeMetadata["name"]) + end + end + end + end + end + rescue => error + @Log.warn("KubernetesApiClient::getWindowsNodesArray:failed with an error: #{error}") + end + return winNodeArray end def getContainerIDs(namespace) @@ -856,10 +874,10 @@ def watch(resource_name, options = {}) end end - def getOptimizedItem(resource, resourceItem) + def getOptimizedItem(resource, resourceItem, isWindowsItem = false) case resource when "pods" - return getPodOptimizedItem(resourceItem) + return getPodOptimizedItem(resourceItem, isWindowsItem) when "nodes" return getNodeOptimizedItem(resourceItem) when "services" @@ -918,23 +936,23 @@ def isWindowsNodeItem(nodeResourceItem) return isWindowsNodeItem end - def isWindowsPodItem(podItem) - isWindowsPod = false - begin - winNodes = KubernetesApiClient.getWindowsNodesArray() - if !winNodes.nil? && !winNodes.empty? && winNodes.length > 0 - nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? podItem["spec"]["nodeName"] : "" - if !nodeName.empty? && winNodes.include?(nodeName) - isWindowsPod = true - end - end - rescue => errorStr - $Log.warn "KubernetesApiClient::::isWindowsPodItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}" - end - return isWindowsPod - end - - def getPodOptimizedItem(resourceItem) + # def isWindowsPodItem(podItem) + # isWindowsPod = false + # begin + # winNodes = KubernetesApiClient.getWindowsNodesArray() + # if !winNodes.nil? && !winNodes.empty? && winNodes.length > 0 + # nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? 
podItem["spec"]["nodeName"] : "" + # if !nodeName.empty? && winNodes.include?(nodeName) + # isWindowsPod = true + # end + # end + # rescue => errorStr + # $Log.warn "KubernetesApiClient::::isWindowsPodItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}" + # end + # return isWindowsPod + # end + + def getPodOptimizedItem(resourceItem, isWindowsPodItem) item = {} begin item["metadata"] = {} @@ -961,7 +979,7 @@ def getPodOptimizedItem(resourceItem) item["metadata"]["deletionTimestamp"] = resourceItem["metadata"]["deletionTimestamp"] end end - isWindowsPod = isWindowsPodItem(resourceItem) + item["spec"] = {} if !resourceItem["spec"].nil? item["spec"]["containers"] = [] @@ -976,7 +994,7 @@ def getPodOptimizedItem(resourceItem) currentContainer["name"] = container["name"] currentContainer["resources"] = container["resources"] # fields required for windows containers records - if isWindowsPod + if isWindowsPodItem currentContainer["image"] = container["image"] currentContainer["ports"] = container["ports"] currentContainer["command"] = container["command"] @@ -995,7 +1013,7 @@ def getPodOptimizedItem(resourceItem) currentContainer["name"] = container["name"] currentContainer["resources"] = container["resources"] # fields required for windows containers records - if isWindowsPod + if isWindowsPodItem currentContainer["image"] = container["image"] currentContainer["ports"] = container["ports"] currentContainer["command"] = container["command"] @@ -1059,12 +1077,17 @@ def getPodOptimizedItem(resourceItem) currentContainerStatus["restartCount"] = containerStatus["restartCount"] currentContainerStatus["state"] = containerStatus["state"] currentContainerStatus["lastState"] = containerStatus["lastState"] - if isWindowsPod + if isWindowsPodItem currentContainerStatus["imageID"] = containerStatus["imageID"] end item["status"]["containerStatuses"].push(currentContainerStatus) end end + # this metadata used to identify the pod scheduled onto windows node + # so that 
pod inventory can make decision to extract containerinventory records or not + if isWindowsPodItem + item["isWindows"] = "true" + end end rescue => errorStr @Log.warn "KubernetesApiClient::getPodOptimizedItem:Failed with an error : #{errorStr}" diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index acff8a591..2bfc98adb 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -50,7 +50,8 @@ def initialize @watchNodesThread = nil @nodeAllocatableCache = {} - @windowsNodeCache = {} + @windowsNodeNameListCache = [] + @windowsContainerRecordsCacheSizeBytes = 0 @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @@ -102,11 +103,11 @@ def start @podCacheMutex = Mutex.new @serviceCacheMutex = Mutex.new @nodeAllocatableCacheMutex = Mutex.new - # @windowsNodeCacheMutex = Mutex.new + @windowsNodeNameCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) + @watchNodesThread = Thread.new(&method(:watch_nodes)) @watchPodsThread = Thread.new(&method(:watch_pods)) @watchServicesThread = Thread.new(&method(:watch_services)) - @watchNodesThread = Thread.new(&method(:watch_nodes)) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -133,6 +134,7 @@ def enumerate(podList = nil) @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 + @windowsContainerRecordsCacheSizeBytes = 0 @controllerData = {} currentTime = Time.now batchTime = currentTime.utc.iso8601 @@ -229,6 +231,7 @@ def enumerate(podList = nil) telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB telemetryProperties["NODE_ALLOCATABLE_ITEMS_CACHE_SIZE_KB"] = nodeAllocatableCacheSizeKB + telemetryProperties["WINDOWS_CONTAINER_RECORDS_CACHE_SIZE_KB"] = @windowsContainerRecordsCacheSizeBytes / 1024 end 
ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) @@ -261,8 +264,8 @@ def parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, @@istestvar = ENV["ISTEST"] begin #begin block start - # Getting windows nodes from kubeapi - winNodes = KubernetesApiClient.getWindowsNodesArray + # # Getting windows nodes from kubeapi + # winNodes = KubernetesApiClient.getWindowsNodesArray podInventory["items"].each do |item| #podInventory block start # pod inventory records podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) @@ -278,17 +281,18 @@ def parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, if !item["spec"]["nodeName"].nil? nodeName = item["spec"]["nodeName"] end - if winNodes.length > 0 - if (!nodeName.empty? && (winNodes.include? nodeName)) - clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] - #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel - containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true) - # Send container inventory records for containers on windows nodes - @winContainerCount += containerInventoryRecords.length - containerInventoryRecords.each do |cirecord| - if !cirecord.nil? - containerInventoryStream.add(emitTime, cirecord) if cirecord - end + if (!item["isWindows"].nil? && !item["isWindows"].empty? 
&& item["isWindows"].downcase == "true") + clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] + #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel + containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true) + if KubernetesApiClient.isEmitCacheTelemetry() + @windowsContainerRecordsCacheSizeBytes += containerInventoryRecords.to_s.length + end + # Send container inventory records for containers on windows nodes + @winContainerCount += containerInventoryRecords.length + containerInventoryRecords.each do |cirecord| + if !cirecord.nil? + containerInventoryStream.add(emitTime, cirecord) if cirecord end end end @@ -711,8 +715,13 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) def watch_pods $log.info("in_kube_podinventory::watch_pods:Start @ #{Time.now.utc.iso8601}") podsResourceVersion = nil - # invoke getWindowsNodes to get windowsnodearray cache populated - KubernetesApiClient.getWindowsNodes() + # invoke getWindowsNodes to handle scenario where windowsNodeNameCache not populated yet on containerstart + winNodes = KubernetesApiClient.getWindowsNodesArray() + if winNodes.length > 0 + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache = winNodes.dup + } + end loop do begin if podsResourceVersion.nil? 
@@ -720,6 +729,10 @@ def watch_pods @podCacheMutex.synchronize { @podItemsCache.clear() } + currentWindowsNodeNameList = [] + @windowsNodeNameCacheMutex.synchronize { + currentWindowsNodeNameList = @windowsNodeNameListCache.dup + } continuationToken = nil $log.info("in_kube_podinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") @@ -731,7 +744,12 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + isWindowsPodItem = false + if !nodeName.empty? && !currentWindowsNodeNameList.nil? && !currentWindowsNodeNameList.empty? && currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -756,7 +774,15 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + isWindowsPodItem = false + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) if !podItem.nil? && !podItem.empty? 
@podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -799,7 +825,19 @@ def watch_pods if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + currentWindowsNodeNameList = [] + @windowsNodeNameCacheMutex.synchronize { + currentWindowsNodeNameList = @windowsNodeNameListCache.dup + } + isWindowsPodItem = false + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -970,9 +1008,9 @@ def watch_nodes @nodeAllocatableCacheMutex.synchronize { @nodeAllocatableCache.clear() } - # @windowsNodeCacheMutex.synchronize { - # @windowsNodeCache.clear() - # } + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache.clear() + } continuationToken = nil $log.info("in_kube_podinventory::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") @@ -986,6 +1024,14 @@ def watch_nodes key = item["metadata"]["name"] if !key.nil? && !key.empty? nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) + if isWindowsNodeItem + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + end if !nodeAllocatable.nil? && !nodeAllocatable.empty? 
@nodeAllocatableCacheMutex.synchronize { @nodeAllocatableCache[key] = nodeAllocatable @@ -1011,6 +1057,14 @@ def watch_nodes key = item["metadata"]["name"] if !key.nil? && !key.empty? nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) + if isWindowsNodeItem + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + end if !nodeAllocatable.nil? && !nodeAllocatable.empty? @nodeAllocatableCacheMutex.synchronize { @nodeAllocatableCache[key] = nodeAllocatable @@ -1052,6 +1106,14 @@ def watch_nodes end if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) key = item["metadata"]["name"] + isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) + if isWindowsNodeItem + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + end if !key.nil? && !key.empty? nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) if !nodeAllocatable.nil? && !nodeAllocatable.empty? @@ -1066,6 +1128,12 @@ def watch_nodes end elsif notice["type"] == "DELETED" key = item["metadata"]["name"] + isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) + if isWindowsNodeItem + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache.delete(key) + } + end if !key.nil? && !key.empty? 
@nodeAllocatableCacheMutex.synchronize { @nodeAllocatableCache.delete(key) From 5da266fdbd1cff63588081ad068c4e3ee4ac7985 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 14 Jan 2022 18:42:22 -0800 Subject: [PATCH 14/65] disable health --- build/linux/installer/conf/kube.conf | 80 ++++++++++++++-------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 10a271d99..0a01d63f2 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -231,12 +231,12 @@ #fluent forward plugin - - @type forward - port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" - bind 0.0.0.0 - chunk_size_limit 4m - + # + # @type forward + # port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" + # bind 0.0.0.0 + # chunk_size_limit 4m + # #Kubernetes Persistent Volume inventory @@ -254,13 +254,13 @@ @log_level debug - #Kubernetes health - - @type kube_health - tag kubehealth.ReplicaSet - run_interval 60 - @log_level debug - + # #Kubernetes health + # + # @type kube_health + # tag kubehealth.ReplicaSet + # run_interval 60 + # @log_level debug + # #cadvisor perf- Windows nodes @@ -287,9 +287,9 @@ #health model aggregation filter - - @type health_model_builder - + # + # @type health_model_builder + # #kubepvinventory @@ -371,30 +371,30 @@ #kubehealth - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - - - @type file - path /var/opt/microsoft/docker-cimprov/state/kubehealth*.buffer - overflow_action drop_oldest_chunk - chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s - retry_max_times 10 - retry_wait 5s - retry_max_interval 5m - flush_thread_count 5 - - keepalive true - + # + # @type forward + # @log_level debug + # send_timeout 30 + # connect_timeout 30 + # heartbeat_type none + # + # host 0.0.0.0 + # port 
"#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + # + # + # @type file + # path /var/opt/microsoft/docker-cimprov/state/kubehealth*.buffer + # overflow_action drop_oldest_chunk + # chunk_limit_size 4m + # queue_limit_length 20 + # flush_interval 20s + # retry_max_times 10 + # retry_wait 5s + # retry_max_interval 5m + # flush_thread_count 5 + # + # keepalive true + # @type mdm From 658a4403bde92f854a01dba81a3059dcddeaa2df Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 16 Jan 2022 20:44:08 -0800 Subject: [PATCH 15/65] config events on different core --- build/linux/installer/conf/kube.conf | 69 ++++++++++++++-------------- kubernetes/linux/main.sh | 12 +++++ kubernetes/omsagent.yaml | 6 +-- 3 files changed, 50 insertions(+), 37 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 0a01d63f2..28a07c223 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -229,6 +229,41 @@ retry_mdm_post_wait_minutes 30 + + #Kubernetes events + + @type kube_events + tag oneagent.containerInsights.KUBE_EVENTS_BLOB + run_interval 60 + @log_level debug + + + #kubeevents + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + #fluent forward plugin # @@ -246,14 +281,6 @@ @log_level debug - #Kubernetes events - - @type kube_events - tag oneagent.containerInsights.KUBE_EVENTS_BLOB - run_interval 60 - @log_level debug - - # #Kubernetes health # # @type kube_health @@ -344,32 +371,6 @@ keepalive true - #kubeevents - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type 
none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - - - @type file - path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer - overflow_action drop_oldest_chunk - chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s - retry_max_times 10 - retry_wait 5s - retry_max_interval 5m - flush_thread_count 5 - - keepalive true - - #kubehealth # # @type forward diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 023cc11e4..00301a969 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -588,16 +588,25 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else case $NUM_OF_FLUENTD_WORKERS in + 4) + export NUM_OF_FLUENTD_WORKERS=4 + export FLUENTD_POD_INVENTORY_WORKER_ID=3 + export FLUENTD_NODE_INVENTORY_WORKER_ID=2 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; 3) export NUM_OF_FLUENTD_WORKERS=3 export FLUENTD_POD_INVENTORY_WORKER_ID=2 export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; 2) export NUM_OF_FLUENTD_WORKERS=2 export FLUENTD_POD_INVENTORY_WORKER_ID=1 export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; @@ -605,12 +614,14 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then export NUM_OF_FLUENTD_WORKERS=1 export FLUENTD_POD_INVENTORY_WORKER_ID=0 export FLUENTD_NODE_INVENTORY_WORKER_ID=0 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; esac echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc + echo 
"export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc source ~/.bashrc @@ -618,6 +629,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "num of workers:${NUM_OF_FLUENTD_WORKERS}" echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}" + echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "*** starting fluentd v1 in replicaset" diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 1a4caf7dd..8a4532035 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -609,8 +609,8 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 3 - memory: 1.5Gi + cpu: 4 + memory: 2Gi requests: cpu: 150m memory: 250Mi @@ -618,7 +618,7 @@ spec: - name: EMIT_CACHE_TELEMETRY value: "true" - name: NUM_OF_FLUENTD_WORKERS - value: "3" # This value should be same as number of CPU cores specified under limits + value: "4" # This value should be same as number of CPU cores specified under limits - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION From b8b8d181530add37e908d45ce5906b1b1e856c24 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 17 Jan 2022 10:44:43 -0800 Subject: [PATCH 16/65] add ts to logs --- source/plugins/ruby/in_kube_nodes.rb | 16 ++++----- source/plugins/ruby/in_kube_podinventory.rb | 40 ++++++++++----------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index a7d9a8f6d..3746dc224 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -599,15 +599,15 @@ def watch_nodes @nodeItemsCache[key] = nodeItem } else - $log.warn 
"in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory" + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") @@ -624,15 +624,15 @@ def watch_nodes @nodeItemsCache[key] = nodeItem } else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory" + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end end end @@ -667,10 +667,10 @@ def watch_nodes @nodeItemsCache[key] = nodeItem } else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty" + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" end elsif notice["type"] == "DELETED" key = item["metadata"]["uid"] diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 
2bfc98adb..36d390cb4 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -755,10 +755,10 @@ def watch_pods @podItemsCache[key] = podItem } else - $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil" + $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" end end end @@ -788,15 +788,15 @@ def watch_pods @podItemsCache[key] = podItem } else - $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil" + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory" + $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" end end end @@ -843,10 +843,10 @@ def watch_pods @podItemsCache[key] = podItem } else - $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil" + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty" + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" end elsif notice["type"] == "DELETED" key = item["metadata"]["uid"] @@ -915,15 +915,15 @@ def watch_services @serviceItemsCache[key] = serviceItem } else - $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty" + $log.warn 
"in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty" + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory" + $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory @ #{Time.now.utc.iso8601}" end serviceInventory = nil end @@ -959,10 +959,10 @@ def watch_services @serviceItemsCache[key] = serviceItem } else - $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty" + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty" + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" end elsif notice["type"] == "DELETED" key = item["metadata"]["uid"] @@ -1037,15 +1037,15 @@ def watch_nodes @nodeAllocatableCache[key] = nodeAllocatable } else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory" + $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end while (!continuationToken.nil? && !continuationToken.empty?) 
continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") @@ -1070,15 +1070,15 @@ def watch_nodes @nodeAllocatableCache[key] = nodeAllocatable } else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory" + $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end end end @@ -1121,10 +1121,10 @@ def watch_nodes @nodeAllocatableCache[key] = nodeAllocatable } else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty" + $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end elsif notice["type"] == "DELETED" key = item["metadata"]["name"] From 6cf9c1137b5f32207d253ba50ed79de5383f6678 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 17 Jan 2022 21:54:09 -0800 Subject: [PATCH 17/65] move kube perf records to separate plugin --- build/linux/installer/conf/kube.conf | 8 + .../installer/datafiles/base_container.data | 1 + kubernetes/linux/main.sh | 19 +- source/plugins/ruby/in_kube_perfinventory.rb | 538 ++++++++++++++++++ source/plugins/ruby/in_kube_podinventory.rb | 232 ++++---- 5 files changed, 681 insertions(+), 117 deletions(-) create mode 100644 source/plugins/ruby/in_kube_perfinventory.rb diff --git 
a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 28a07c223..6f4d91fe6 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -273,6 +273,14 @@ # chunk_size_limit 4m # + #Kubernetes perf inventory + + @type kube_perfinventory + tag oneagent.containerInsights.LINUX_PERF_BLOB + run_interval 60 + @log_level debug + + #Kubernetes Persistent Volume inventory @type kube_pvinventory diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index a405e760f..0268499dc 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -162,6 +162,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/fluent/plugin/in_containerinventory.rb; source/plugins/ruby/in_containerinventory.rb; 644; root; root /etc/fluent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root /etc/fluent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root +/etc/fluent/plugin/in_kube_perfinventory.rb; source/plugins/ruby/in_kube_perfinventory.rb; 644; root; root /etc/fluent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root /etc/fluent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root /etc/fluent/plugin/in_kube_health.rb; source/plugins/ruby/in_kube_health.rb; 644; root; root diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 00301a969..80da23d23 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -570,9 +570,26 @@ if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & else - echo "starting mdsd mode in main container..." 
+ echo "starting mdsd in main container..." # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>>/dev/null & + + ## TODO- evaluate again multiplace instances of mdsd + # echo "starting mdsd tenant instance 2 in main container..." + # echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in main container..." + # #use tenant name to avoid unix socket conflict and different ports for port conflict + # #roleprefix to use container specific mdsd socket + # MDSD_INSTANCE_ID="tenant2" + # export TENANT_NAME="${MDSD_INSTANCE_ID}" + # echo "export TENANT_NAME=$TENANT_NAME" >>~/.bashrc + # export MDSD_ROLE_PREFIX=/var/run/mdsd-${TENANT_NAME}/default + # echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >>~/.bashrc + # export MDSD_FLUENT_SOCKET_PORT_TENANT2="26230" + # echo "export MDSD_FLUENT_SOCKET_PORT_TENANT2=$MDSD_FLUENT_SOCKET_PORT_TENANT2" >>~/.bashrc + # source ~/.bashrc + # mkdir /var/run/mdsd-${MDSD_INSTANCE_ID} + # # add -T 0xFFFF for full traces + # mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd2.err -w ${MDSD_LOG}/mdsd2.warn -o ${MDSD_LOG}/mdsd2.info -q ${MDSD_LOG}/mdsd2.qos & fi # Set up a cron job for logrotation diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb new file mode 100644 index 000000000..888f0db76 --- /dev/null +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -0,0 +1,538 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require "fluent/plugin/input" + +module Fluent::Plugin + class Kube_PerfInventory_Input < Input + Fluent::Plugin.register_input("kube_perfinventory", self) + + def initialize + super + require "yaml" + require "yajl/json_gem" + require "yajl" + require "set" + require "time" + require "net/http" + + require_relative "KubernetesApiClient" + require_relative 
"ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + require_relative "constants" + require_relative "extension_utils" + + # refer tomlparser-agent-config for updating defaults + # this configurable via configmap + @PODS_CHUNK_SIZE = 0 + @PODS_EMIT_STREAM_BATCH_SIZE = 0 + @NODES_CHUNK_SIZE = 0 + + @watchPodsThread = nil + @podItemsCache = {} + + @watchNodesThread = nil + @nodeAllocatableCache = {} + + @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => "oneagent.containerInsights.LINUX_PERF_BLOB" + + def configure(conf) + super + end + + def start + if @run_interval + super + if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? && ENV["PODS_CHUNK_SIZE"].to_i > 0 + @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_perfinventory::start: setting to default value since got PODS_CHUNK_SIZE nil or empty") + @PODS_CHUNK_SIZE = 1000 + end + $log.info("in_kube_perfinventory::start: PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}") + + if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? && ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i > 0 + @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_perfinventory::start: setting to default value since got PODS_EMIT_STREAM_BATCH_SIZE nil or empty") + @PODS_EMIT_STREAM_BATCH_SIZE = 200 + end + $log.info("in_kube_perfinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") + + if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? 
&& ENV["NODES_CHUNK_SIZE"].to_i > 0 + @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_perfinventory::start: setting to default value since got NODES_CHUNK_SIZE nil or empty") + @NODES_CHUNK_SIZE = 250 + end + $log.info("in_kube_perfinventory::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}") + + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @podCacheMutex = Mutex.new + @nodeAllocatableCacheMutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @watchNodesThread = Thread.new(&method(:watch_nodes)) + @watchPodsThread = Thread.new(&method(:watch_pods)) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + @watchPodsThread.join + @watchNodesThread.join + super # This super must be at the end of shutdown method + end + end + + def enumerate(podList = nil) + begin + podInventory = podList + @podCount = 0 + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kube_perfinventory::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @insightsMetricsTag.nil? 
|| !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + $log.info("in_kube_perfinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_perfinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + end + + nodeAllocatableRecords = {} + nodeAllocatableCacheSizeKB = 0 + @nodeAllocatableCacheMutex.synchronize { + nodeAllocatableRecords = @nodeAllocatableCache.clone + } + $log.info("in_kube_perfinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") + # Initializing continuation token to nil + continuationToken = nil + podItemsCacheSizeKB = 0 + podInventory = {} + @podCacheMutex.synchronize { + podInventory["items"] = @podItemsCache.values.clone + } + if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
+ $log.info("in_kube_perfinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationToken, batchTime) + else + $log.warn "in_kube_perfinventory::enumerate:Received empty podInventory" + end + # Setting these to nil so that we dont hold memory until GC kicks in + podInventory = nil + nodeAllocatableRecords = nil + rescue => errorStr + $log.warn "in_kube_perfinventory::enumerate:Failed in enumerate: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationToken, batchTime = Time.utc.iso8601) + currentTime = Time.now + emitTime = Fluent::Engine.now + kubePerfEventStream = Fluent::MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new + @@istestvar = ENV["ISTEST"] + + begin #begin block start + # # Getting windows nodes from kubeapi + # winNodes = KubernetesApiClient.getWindowsNodesArray + podInventory["items"].each do |item| #podInventory block start + nodeName = "" + if !item["spec"]["nodeName"].nil? + nodeName = item["spec"]["nodeName"] + end + + nodeAllocatableRecord = {} + if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? 
&& nodeAllocatableRecords.has_key?(nodeName) + nodeAllocatableRecord = nodeAllocatableRecords[nodeName] + end + #container perf records + containerMetricDataItems = [] + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime)) + + containerMetricDataItems.each do |record| + kubePerfEventStream.add(emitTime, record) if record + end + + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_perfinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + kubePerfEventStream = Fluent::MultiEventStream.new + end + + # container GPU records + containerGPUInsightsMetricsDataItems = [] + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord + end + + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_perfinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = Fluent::MultiEventStream.new + end + end #podInventory block end + + if kubePerfEventStream.count > 0 + $log.info("in_kube_perfinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end + + if insightsMetricsEventStream.count > 0 + $log.info("in_kube_perfinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + insightsMetricsEventStream = nil + end + rescue => errorStr + $log.warn "Failed in parse_and_emit_record kube perf inventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end #begin block end + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_kube_perfinventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kube_perfinventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kube_perfinventory::run_periodic: enumerate Failed to retrieve perf inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock + end + @mutex.unlock + end + + def watch_pods + $log.info("in_kube_perfinventory::watch_pods:Start @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + loop do + begin + if podsResourceVersion.nil? 
+ # clear cache before filling the cache with list + @podCacheMutex.synchronize { + @podItemsCache.clear() + } + currentWindowsNodeNameList = [] + continuationToken = nil + $log.info("in_kube_perfinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") + $log.info("in_kube_perfinventory::watch_pods:Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + if (!podInventory.nil? && !podInventory.empty?) + podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) + $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods", item) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory" + end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") + if (!podInventory.nil? && !podInventory.empty?) + podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
+ $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods", item) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + end + end + end + begin + $log.info("in_kube_perfinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_perfinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
+ podsResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods", item) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? 
+ @podCacheMutex.synchronize { + @podItemsCache.delete(key) + } + end + end + when "ERROR" + podsResourceVersion = nil + $log.warn("in_kube_perfinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + end + end + $log.info("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity more than readtimeout value used in the connection + $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher + end + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + end + end + $log.info("in_kube_perfinventory::watch_pods:End @ #{Time.now.utc.iso8601}") + end + + def watch_nodes + $log.info("in_kube_perfinventory::watch_nodes:Start @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + loop do + begin + if nodesResourceVersion.nil? 
+ # clear node limits cache before filling the cache with list + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache.clear() + } + continuationToken = nil + $log.info("in_kube_perfinventory::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) + $log.info("in_kube_perfinventory::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_perfinventory::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + if (!nodeInventory.nil? && !nodeInventory.empty?) 
+ nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_perfinventory::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + end + end + begin + $log.info("in_kube_perfinventory::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_perfinventory::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
+ nodesResourceVersion = item["metadata"]["resourceVersion"] + $log.info("in_kube_perfinventory::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_perfinventory::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["name"] + if !key.nil? && !key.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache[key] = nodeAllocatable + } + else + $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["name"] + if !key.nil? && !key.empty? 
+ @nodeAllocatableCacheMutex.synchronize { + @nodeAllocatableCache.delete(key) + } + end + end + when "ERROR" + nodesResourceVersion = nil + $log.warn("in_kube_perfinventory::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_perfinventory::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + end + end + end + rescue Net::ReadTimeout => errorStr + $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher + end + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + end + end + $log.info("in_kube_perfinventory::watch_nodes:End @ #{Time.now.utc.iso8601}") + end + end # Kube_Pod_Input +end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 36d390cb4..bdf0a7f35 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -49,14 +49,14 @@ def initialize @serviceItemsCache = {} @watchNodesThread = nil - @nodeAllocatableCache = {} + # @nodeAllocatableCache = {} @windowsNodeNameListCache = [] @windowsContainerRecordsCacheSizeBytes = 0 - @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" + # @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" - @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + # @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end 
config_param :run_interval, :time, :default => 60 @@ -102,7 +102,7 @@ def start @mutex = Mutex.new @podCacheMutex = Mutex.new @serviceCacheMutex = Mutex.new - @nodeAllocatableCacheMutex = Mutex.new + # @nodeAllocatableCacheMutex = Mutex.new @windowsNodeNameCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) @watchNodesThread = Thread.new(&method(:watch_nodes)) @@ -143,25 +143,25 @@ def enumerate(podList = nil) podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") - if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - end + # if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + # @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + # end if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) end if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) end - if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - end + # if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + # @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + # end if @tag.nil? 
|| !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) end - $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") end @@ -178,15 +178,15 @@ def enumerate(podList = nil) @serviceCount = serviceRecords.length $log.info("in_kube_podinventory::enumerate : number of service items :#{@serviceCount} from Kube API @ #{Time.now.utc.iso8601}") - nodeAllocatableRecords = {} - nodeAllocatableCacheSizeKB = 0 - @nodeAllocatableCacheMutex.synchronize { - nodeAllocatableRecords = @nodeAllocatableCache.clone - } - if KubernetesApiClient.isEmitCacheTelemetry() - nodeAllocatableCacheSizeKB = nodeAllocatableRecords.to_s.length / 1024 - end - $log.info("in_kube_podinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") + # nodeAllocatableRecords = {} + # nodeAllocatableCacheSizeKB = 0 + # @nodeAllocatableCacheMutex.synchronize { + # nodeAllocatableRecords = @nodeAllocatableCache.clone + # } + # if KubernetesApiClient.isEmitCacheTelemetry() + # nodeAllocatableCacheSizeKB = nodeAllocatableRecords.to_s.length / 1024 + # end + # $log.info("in_kube_podinventory::enumerate : 
number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") # to track e2e processing latency @podsAPIE2ELatencyMs = 0 podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i @@ -204,7 +204,7 @@ def enumerate(podList = nil) @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -212,7 +212,7 @@ def enumerate(podList = nil) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil - nodeAllocatableRecords = nil + # nodeAllocatableRecords = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -230,7 +230,7 @@ def enumerate(podList = nil) if KubernetesApiClient.isEmitCacheTelemetry() telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB - telemetryProperties["NODE_ALLOCATABLE_ITEMS_CACHE_SIZE_KB"] = nodeAllocatableCacheSizeKB + # telemetryProperties["NODE_ALLOCATABLE_ITEMS_CACHE_SIZE_KB"] = nodeAllocatableCacheSizeKB telemetryProperties["WINDOWS_CONTAINER_RECORDS_CACHE_SIZE_KB"] = @windowsContainerRecordsCacheSizeBytes / 1024 end ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) @@ -253,7 +253,7 @@ def enumerate(podList = nil) end end - def parse_and_emit_records(podInventory, 
serviceRecords, nodeAllocatableRecords, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = Fluent::Engine.now #batchTime = currentTime.utc.iso8601 @@ -306,48 +306,48 @@ def parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, eventStream = Fluent::MultiEventStream.new end - nodeAllocatableRecord = {} - if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? && nodeAllocatableRecords.has_key?(nodeName) - nodeAllocatableRecord = nodeAllocatableRecords[nodeName] - end - #container perf records - containerMetricDataItems = [] - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime)) + # nodeAllocatableRecord = {} + # if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? 
&& nodeAllocatableRecords.has_key?(nodeName) + # nodeAllocatableRecord = nodeAllocatableRecords[nodeName] + # end + # #container perf records + # containerMetricDataItems = [] + # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime)) + # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime)) + # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime)) + # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime)) - containerMetricDataItems.each do |record| - kubePerfEventStream.add(emitTime, record) if record - end + # containerMetricDataItems.each do |record| + # kubePerfEventStream.add(emitTime, record) if record + # end - if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - kubePerfEventStream = Fluent::MultiEventStream.new - end + # if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + # $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + # router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + # if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + # $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + # end + # kubePerfEventStream = Fluent::MultiEventStream.new + # end - # container GPU records - containerGPUInsightsMetricsDataItems = [] - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) - containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord - end + # # container GPU records + # containerGPUInsightsMetricsDataItems = [] + # 
containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) + # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) + # containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + # insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord + # end - if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream - insightsMetricsEventStream = Fluent::MultiEventStream.new - end + # if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + # $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + # if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + # $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + # end + # router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + # insightsMetricsEventStream = Fluent::MultiEventStream.new + # end end #podInventory block end if eventStream.count > 0 @@ -368,23 +368,23 @@ def parse_and_emit_records(podInventory, serviceRecords, nodeAllocatableRecords, containerInventoryStream = nil end - if kubePerfEventStream.count > 0 - $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - kubePerfEventStream = nil - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - end + # if kubePerfEventStream.count > 0 + # $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + # router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + # kubePerfEventStream = nil + # if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + # $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + # end + # end - if insightsMetricsEventStream.count > 0 - $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - insightsMetricsEventStream = nil - end + # if insightsMetricsEventStream.count > 0 + # $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + # router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + # if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + # $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + # end + # insightsMetricsEventStream = nil + # end if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send @log.info "Sending pod inventory mdm records to out_mdm" @@ -1004,10 +1004,10 @@ def watch_nodes loop do begin if nodesResourceVersion.nil? - # clear node limits cache before filling the cache with list - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache.clear() - } + # # clear node limits cache before filling the cache with list + # @nodeAllocatableCacheMutex.synchronize { + # @nodeAllocatableCache.clear() + # } @windowsNodeNameCacheMutex.synchronize { @windowsNodeNameListCache.clear() } @@ -1023,7 +1023,7 @@ def watch_nodes nodeInventory["items"].each do |item| key = item["metadata"]["name"] if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) if isWindowsNodeItem @windowsNodeNameCacheMutex.synchronize { @@ -1032,13 +1032,13 @@ def watch_nodes end } end - if !nodeAllocatable.nil? && !nodeAllocatable.empty? 
- @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end + # if !nodeAllocatable.nil? && !nodeAllocatable.empty? + # @nodeAllocatableCacheMutex.synchronize { + # @nodeAllocatableCache[key] = nodeAllocatable + # } + # else + # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + # end else $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end @@ -1056,7 +1056,7 @@ def watch_nodes nodeInventory["items"].each do |item| key = item["metadata"]["name"] if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) if isWindowsNodeItem @windowsNodeNameCacheMutex.synchronize { @@ -1065,13 +1065,13 @@ def watch_nodes end } end - if !nodeAllocatable.nil? && !nodeAllocatable.empty? - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end + # if !nodeAllocatable.nil? && !nodeAllocatable.empty? + # @nodeAllocatableCacheMutex.synchronize { + # @nodeAllocatableCache[key] = nodeAllocatable + # } + # else + # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + # end else $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end @@ -1114,18 +1114,18 @@ def watch_nodes end } end - if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - if !nodeAllocatable.nil? && !nodeAllocatable.empty? 
- @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - end + # if !key.nil? && !key.empty? + # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + # if !nodeAllocatable.nil? && !nodeAllocatable.empty? + # @nodeAllocatableCacheMutex.synchronize { + # @nodeAllocatableCache[key] = nodeAllocatable + # } + # else + # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + # end + # else + # $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + # end elsif notice["type"] == "DELETED" key = item["metadata"]["name"] isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) @@ -1134,11 +1134,11 @@ def watch_nodes @windowsNodeNameListCache.delete(key) } end - if !key.nil? && !key.empty? - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache.delete(key) - } - end + # if !key.nil? && !key.empty? 
+ # @nodeAllocatableCacheMutex.synchronize { + # @nodeAllocatableCache.delete(key) + # } + # end end when "ERROR" nodesResourceVersion = nil From 9f08cb0c696ff47d42f8a3668b8806385c6dd851 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 18 Jan 2022 21:46:39 -0800 Subject: [PATCH 18/65] refactor --- source/plugins/ruby/KubernetesApiClient.rb | 135 +++++++---- source/plugins/ruby/in_kube_perfinventory.rb | 7 +- source/plugins/ruby/in_kube_podinventory.rb | 226 ++++--------------- 3 files changed, 140 insertions(+), 228 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 1a7444b28..f1afd4ac6 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -878,6 +878,8 @@ def getOptimizedItem(resource, resourceItem, isWindowsItem = false) case resource when "pods" return getPodOptimizedItem(resourceItem, isWindowsItem) + when "pods-perf" + return getPodPerfOptimizedItem(resourceItem) when "nodes" return getNodeOptimizedItem(resourceItem) when "services" @@ -936,21 +938,58 @@ def isWindowsNodeItem(nodeResourceItem) return isWindowsNodeItem end - # def isWindowsPodItem(podItem) - # isWindowsPod = false - # begin - # winNodes = KubernetesApiClient.getWindowsNodesArray() - # if !winNodes.nil? && !winNodes.empty? && winNodes.length > 0 - # nodeName = (!podItem["spec"].nil? && !podItem["spec"]["nodeName"].nil?) ? podItem["spec"]["nodeName"] : "" - # if !nodeName.empty? && winNodes.include?(nodeName) - # isWindowsPod = true - # end - # end - # rescue => errorStr - # $Log.warn "KubernetesApiClient::::isWindowsPodItem: failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}" - # end - # return isWindowsPod - # end + def getPodPerfOptimizedItem(resourceItem) + item = {} + begin + item["metadata"] = {} + if !resourceItem["metadata"].nil? + if !resourceItem["metadata"]["annotations"].nil? 
+ item["metadata"]["annotations"] = {} + item["metadata"]["annotations"]["kubernetes.io/config.hash"] = resourceItem["metadata"]["annotations"]["kubernetes.io/config.hash"] + end + + if !resourceItem["metadata"]["ownerReferences"].nil? && resourceItem["metadata"]["ownerReferences"].length > 0 + item["metadata"]["ownerReferences"] = [] + ownerReference = {} + ownerReference["name"] = resourceItem["metadata"]["ownerReferences"][0]["name"] + ownerReference["kind"] = resourceItem["metadata"]["ownerReferences"][0]["kind"] + item["metadata"]["ownerReferences"].push(ownerReference) + end + item["metadata"]["name"] = resourceItem["metadata"]["name"] + item["metadata"]["namespace"] = resourceItem["metadata"]["namespace"] + item["metadata"]["uid"] = resourceItem["metadata"]["uid"] + end + + item["spec"] = {} + if !resourceItem["spec"].nil? + item["spec"]["containers"] = [] + if !resourceItem["spec"]["containers"].nil? + resourceItem["spec"]["containers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + item["spec"]["containers"].push(currentContainer) + end + end + item["spec"]["initContainers"] = [] + if !resourceItem["spec"]["initContainers"].nil? + resourceItem["spec"]["initContainers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + item["spec"]["initContainers"].push(currentContainer) + end + end + item["spec"]["nodeName"] = "" + if !resourceItem["spec"]["nodeName"].nil? + item["spec"]["nodeName"] = resourceItem["spec"]["nodeName"] + end + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getPodPerfOptimizedItem:Failed with an error : #{errorStr}" + end + return item + end def getPodOptimizedItem(resourceItem, isWindowsPodItem) item = {} @@ -983,54 +1022,60 @@ def getPodOptimizedItem(resourceItem, isWindowsPodItem) item["spec"] = {} if !resourceItem["spec"].nil? 
item["spec"]["containers"] = [] + item["spec"]["initContainers"] = [] isDisableClusterCollectEnvVar = false clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? && clusterCollectEnvironmentVar.casecmp("false") == 0 isDisableClusterCollectEnvVar = true end - if !resourceItem["spec"]["containers"].nil? - resourceItem["spec"]["containers"].each do |container| - currentContainer = {} - currentContainer["name"] = container["name"] - currentContainer["resources"] = container["resources"] - # fields required for windows containers records - if isWindowsPodItem - currentContainer["image"] = container["image"] - currentContainer["ports"] = container["ports"] - currentContainer["command"] = container["command"] - currentContainer["env"] = "" - if !isDisableClusterCollectEnvVar - currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + + # container spec required only for windows container inventory records + if isWindowsPodItem + if !resourceItem["spec"]["containers"].nil? + resourceItem["spec"]["containers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + # fields required for windows containers records + if isWindowsPodItem + currentContainer["image"] = container["image"] + currentContainer["ports"] = container["ports"] + currentContainer["command"] = container["command"] + currentContainer["env"] = "" + if !isDisableClusterCollectEnvVar + currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + end end + item["spec"]["containers"].push(currentContainer) end - item["spec"]["containers"].push(currentContainer) end - end - item["spec"]["initContainers"] = [] - if !resourceItem["spec"]["initContainers"].nil? 
- resourceItem["spec"]["initContainers"].each do |container| - currentContainer = {} - currentContainer["name"] = container["name"] - currentContainer["resources"] = container["resources"] - # fields required for windows containers records - if isWindowsPodItem - currentContainer["image"] = container["image"] - currentContainer["ports"] = container["ports"] - currentContainer["command"] = container["command"] - currentContainer["env"] = "" - if !isDisableClusterCollectEnvVar - currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + if !resourceItem["spec"]["initContainers"].nil? + resourceItem["spec"]["initContainers"].each do |container| + currentContainer = {} + currentContainer["name"] = container["name"] + currentContainer["resources"] = container["resources"] + # fields required for windows containers records + if isWindowsPodItem + currentContainer["image"] = container["image"] + currentContainer["ports"] = container["ports"] + currentContainer["command"] = container["command"] + currentContainer["env"] = "" + if !isDisableClusterCollectEnvVar + currentContainer["env"] = KubernetesContainerInventory.obtainContainerEnvironmentVarsFromPodsResponse(resourceItem, container) + end end + item["spec"]["initContainers"].push(currentContainer) end - item["spec"]["initContainers"].push(currentContainer) end end + item["spec"]["nodeName"] = "" if !resourceItem["spec"]["nodeName"].nil? item["spec"]["nodeName"] = resourceItem["spec"]["nodeName"] end end item["status"] = {} + if !resourceItem["status"].nil? if !resourceItem["status"]["startTime"].nil? 
item["status"]["startTime"] = resourceItem["status"]["startTime"] diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 888f0db76..7403b86f3 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -156,7 +156,6 @@ def parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationTok begin #begin block start # # Getting windows nodes from kubeapi - # winNodes = KubernetesApiClient.getWindowsNodesArray podInventory["items"].each do |item| #podInventory block start nodeName = "" if !item["spec"]["nodeName"].nil? @@ -285,7 +284,7 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -310,7 +309,7 @@ def watch_pods podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) if !podItem.nil? && !podItem.empty? @podCacheMutex.synchronize { @podItemsCache[key] = podItem @@ -353,7 +352,7 @@ def watch_pods if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) key = item["metadata"]["uid"] if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods", item) + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) if !podItem.nil? && !podItem.empty? 
@podCacheMutex.synchronize { @podItemsCache[key] = podItem diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index bdf0a7f35..d466a7637 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -49,14 +49,11 @@ def initialize @serviceItemsCache = {} @watchNodesThread = nil - # @nodeAllocatableCache = {} @windowsNodeNameListCache = [] @windowsContainerRecordsCacheSizeBytes = 0 - # @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" - # @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 @@ -102,10 +99,9 @@ def start @mutex = Mutex.new @podCacheMutex = Mutex.new @serviceCacheMutex = Mutex.new - # @nodeAllocatableCacheMutex = Mutex.new @windowsNodeNameCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) - @watchNodesThread = Thread.new(&method(:watch_nodes)) + @watchNodesThread = Thread.new(&method(:watch_windows_nodes)) @watchPodsThread = Thread.new(&method(:watch_pods)) @watchServicesThread = Thread.new(&method(:watch_services)) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i @@ -143,25 +139,18 @@ def enumerate(podList = nil) podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") - # if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - # @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - # end if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) end if @containerInventoryTag.nil? 
|| !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) end - # if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - # @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - # end if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) end - # $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") - # $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") end @@ -178,16 +167,6 @@ def enumerate(podList = nil) @serviceCount = serviceRecords.length $log.info("in_kube_podinventory::enumerate : number of service items :#{@serviceCount} from Kube API @ #{Time.now.utc.iso8601}") - # nodeAllocatableRecords = {} - # nodeAllocatableCacheSizeKB = 0 - # @nodeAllocatableCacheMutex.synchronize { - # nodeAllocatableRecords = @nodeAllocatableCache.clone - # } - # if KubernetesApiClient.isEmitCacheTelemetry() - # nodeAllocatableCacheSizeKB = nodeAllocatableRecords.to_s.length / 1024 - # end - # $log.info("in_kube_podinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") - # to track e2e processing latency @podsAPIE2ELatencyMs = 0 podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing 
continuation token to nil @@ -212,7 +191,6 @@ def enumerate(podList = nil) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil - # nodeAllocatableRecords = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -230,7 +208,6 @@ def enumerate(podList = nil) if KubernetesApiClient.isEmitCacheTelemetry() telemetryProperties["POD_ITEMS_CACHE_SIZE_KB"] = podItemsCacheSizeKB telemetryProperties["SERVICE_ITEMS_CACHE_SIZE_KB"] = serviceItemsCacheSizeKB - # telemetryProperties["NODE_ALLOCATABLE_ITEMS_CACHE_SIZE_KB"] = nodeAllocatableCacheSizeKB telemetryProperties["WINDOWS_CONTAINER_RECORDS_CACHE_SIZE_KB"] = @windowsContainerRecordsCacheSizeBytes / 1024 end ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) @@ -264,8 +241,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc @@istestvar = ENV["ISTEST"] begin #begin block start - # # Getting windows nodes from kubeapi - # winNodes = KubernetesApiClient.getWindowsNodesArray podInventory["items"].each do |item| #podInventory block start # pod inventory records podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) @@ -305,49 +280,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc router.emit_stream(@tag, eventStream) if eventStream eventStream = Fluent::MultiEventStream.new end - - # nodeAllocatableRecord = {} - # if !nodeName.empty? && !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? 
&& nodeAllocatableRecords.has_key?(nodeName) - # nodeAllocatableRecord = nodeAllocatableRecords[nodeName] - # end - # #container perf records - # containerMetricDataItems = [] - # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", nodeAllocatableRecord, batchTime)) - # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", nodeAllocatableRecord, batchTime)) - # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", nodeAllocatableRecord, batchTime)) - # containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", nodeAllocatableRecord, batchTime)) - - # containerMetricDataItems.each do |record| - # kubePerfEventStream.add(emitTime, record) if record - # end - - # if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - # $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - # router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - # if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - # $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") - # end - # kubePerfEventStream = Fluent::MultiEventStream.new - # end - - # # container GPU records - # containerGPUInsightsMetricsDataItems = [] - # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) - # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) - # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", nodeAllocatableRecord, batchTime)) - # containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", nodeAllocatableRecord, batchTime)) - # containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - # insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord - # end - - # if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - # $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - # if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - # $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - # end - # router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream - # insightsMetricsEventStream = Fluent::MultiEventStream.new - # end end #podInventory block end if eventStream.count > 0 @@ -368,24 +300,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerInventoryStream = nil end - # if kubePerfEventStream.count > 0 - # $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - # router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - # kubePerfEventStream = nil - # if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - # $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") - # end - # end - - # if insightsMetricsEventStream.count > 0 - # $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - # router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream - # if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - # $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - # end - # insightsMetricsEventStream = nil - # end - if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send @log.info "Sending pod inventory mdm records to out_mdm" pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) @@ -746,7 +660,10 @@ def watch_pods if !key.nil? && !key.empty? nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" isWindowsPodItem = false - if !nodeName.empty? 
&& !currentWindowsNodeNameList.nil? && !currentWindowsNodeNameList.empty? && currentWindowsNodeNameList.include?(nodeName) + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) isWindowsPodItem = true end podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) @@ -998,95 +915,69 @@ def watch_services $log.info("in_kube_podinventory::watch_services:End @ #{Time.now.utc.iso8601}") end - def watch_nodes - $log.info("in_kube_podinventory::watch_nodes:Start @ #{Time.now.utc.iso8601}") + def watch_windows_nodes + $log.info("in_kube_podinventory::watch_windows_nodes:Start @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil loop do begin if nodesResourceVersion.nil? - # # clear node limits cache before filling the cache with list - # @nodeAllocatableCacheMutex.synchronize { - # @nodeAllocatableCache.clear() - # } @windowsNodeNameCacheMutex.synchronize { @windowsNodeNameListCache.clear() } continuationToken = nil - $log.info("in_kube_podinventory::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") + $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows&limit=#{@NODES_CHUNK_SIZE}") continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_podinventory::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes:Done getting windows nodes from Kube API @ #{Time.now.utc.iso8601}") if (!nodeInventory.nil? && !nodeInventory.empty?) 
nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes: number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["name"] if !key.nil? && !key.empty? - # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) - if isWindowsNodeItem - @windowsNodeNameCacheMutex.synchronize { - if !@windowsNodeNameListCache.include?(key) - @windowsNodeNameListCache.push(key) - end - } - end - # if !nodeAllocatable.nil? && !nodeAllocatable.empty? - # @nodeAllocatableCacheMutex.synchronize { - # @nodeAllocatableCache[key] = nodeAllocatable - # } - # else - # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - # end + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") if (!nodeInventory.nil? 
&& !nodeInventory.empty?) nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes : number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["name"] if !key.nil? && !key.empty? - # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) - if isWindowsNodeItem - @windowsNodeNameCacheMutex.synchronize { - if !@windowsNodeNameListCache.include?(key) - @windowsNodeNameListCache.push(key) - end - } - end - # if !nodeAllocatable.nil? && !nodeAllocatable.empty? - # @nodeAllocatableCacheMutex.synchronize { - # @nodeAllocatableCache[key] = nodeAllocatable - # } - # else - # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - # end + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } else - $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" end end end else - $log.warn "in_kube_podinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end end end begin - $log.info("in_kube_podinventory::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = 
KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + $log.info("in_kube_podinventory::watch_windows_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", label_selector: "kubernetes.io/os=windows", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) if watcher.nil? - $log.warn("in_kube_podinventory::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") else watcher.each do |notice| case notice["type"] @@ -1097,73 +988,50 @@ def watch_nodes !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? nodesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_podinventory::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
break end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + if notice["type"] == "ADDED" # we dont need to worry about modified event since we only need name key = item["metadata"]["name"] - isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) - if isWindowsNodeItem - @windowsNodeNameCacheMutex.synchronize { - if !@windowsNodeNameListCache.include?(key) - @windowsNodeNameListCache.push(key) - end - } - end - # if !key.nil? && !key.empty? - # nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - # if !nodeAllocatable.nil? && !nodeAllocatable.empty? - # @nodeAllocatableCacheMutex.synchronize { - # @nodeAllocatableCache[key] = nodeAllocatable - # } - # else - # $log.warn "in_kube_podinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - # end - # else - # $log.warn "in_kube_podinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - # end + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } elsif notice["type"] == "DELETED" key = item["metadata"]["name"] - isWindowsNodeItem = KubernetesApiClient.isWindowsNodeItem(item) - if isWindowsNodeItem - @windowsNodeNameCacheMutex.synchronize { - @windowsNodeNameListCache.delete(key) - } - end - # if !key.nil? && !key.empty? 
- # @nodeAllocatableCacheMutex.synchronize { - # @nodeAllocatableCache.delete(key) - # } - # end + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache.delete(key) + } end when "ERROR" nodesResourceVersion = nil - $log.warn("in_kube_podinventory::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") break else - $log.warn("in_kube_podinventory::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end end rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr - $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil sleep(5) # do not overwhelm the api-server if api-server broken ensure watcher.finish if watcher end rescue => errorStr - $log.warn("in_kube_podinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil end end - $log.info("in_kube_podinventory::watch_nodes:End @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes:End @ #{Time.now.utc.iso8601}") end end # Kube_Pod_Input end # module From 056ea8b4b1d1a77742cfe6625ef65f5b13049171 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 19 Jan 2022 11:17:36 -0800 Subject: [PATCH 
19/65] minor update --- kubernetes/omsagent.yaml | 9 ++++++--- source/plugins/ruby/in_kube_podinventory.rb | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 8a4532035..ea282929a 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -615,10 +615,13 @@ spec: cpu: 150m memory: 250Mi env: - - name: EMIT_CACHE_TELEMETRY - value: "true" - name: NUM_OF_FLUENTD_WORKERS - value: "4" # This value should be same as number of CPU cores specified under limits + valueFrom: + resourceFieldRef: + containerName: omsagent + resource: limits.cpu + - name: EMIT_CACHE_TELEMETRY + value: "true" # enable only debug or test purpose and disable for prod - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index d466a7637..f0ddac0b8 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -995,7 +995,7 @@ def watch_windows_nodes # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
break end - if notice["type"] == "ADDED" # we dont need to worry about modified event since we only need name + if notice["type"] == "ADDED" # we dont need to worry about modified event since we only need node name key = item["metadata"]["name"] @windowsNodeNameCacheMutex.synchronize { if !@windowsNodeNameListCache.include?(key) From b940e454573e56cfbff78563c76d44c9725f58ad Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 2 Feb 2022 11:20:55 -0800 Subject: [PATCH 20/65] remove commented code --- kubernetes/linux/main.sh | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index bea27379c..048dfcaa0 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -578,23 +578,6 @@ else echo "starting mdsd in main container..." # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>>/dev/null & - - ## TODO- evaluate again multiplace instances of mdsd - # echo "starting mdsd tenant instance 2 in main container..." - # echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in main container..." 
- # #use tenant name to avoid unix socket conflict and different ports for port conflict - # #roleprefix to use container specific mdsd socket - # MDSD_INSTANCE_ID="tenant2" - # export TENANT_NAME="${MDSD_INSTANCE_ID}" - # echo "export TENANT_NAME=$TENANT_NAME" >>~/.bashrc - # export MDSD_ROLE_PREFIX=/var/run/mdsd-${TENANT_NAME}/default - # echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >>~/.bashrc - # export MDSD_FLUENT_SOCKET_PORT_TENANT2="26230" - # echo "export MDSD_FLUENT_SOCKET_PORT_TENANT2=$MDSD_FLUENT_SOCKET_PORT_TENANT2" >>~/.bashrc - # source ~/.bashrc - # mkdir /var/run/mdsd-${MDSD_INSTANCE_ID} - # # add -T 0xFFFF for full traces - # mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd2.err -w ${MDSD_LOG}/mdsd2.warn -o ${MDSD_LOG}/mdsd2.info -q ${MDSD_LOG}/mdsd2.qos & fi # Set up a cron job for logrotation From 3a0cff2d5f34becee7a07d67b08737db47add8bb Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 5 Feb 2022 10:42:03 -0800 Subject: [PATCH 21/65] mdm state file --- source/plugins/ruby/constants.rb | 4 ++ source/plugins/ruby/in_kube_podinventory.rb | 48 ++++++++++++++++----- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index b9516c2ce..0b16e82f8 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -136,4 +136,8 @@ class Constants #This is for telemetry to track if any of the windows customer has any of the field size >= 64KB #To evaluate switching to Windows AMA 64KB impacts any existing customers MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY = 65536 + + # FileName for MDM POD Inventory records + MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json" + MDM_POD_INVENTORY_STATE_TEMP_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryStateTemp.json" end diff --git a/source/plugins/ruby/in_kube_podinventory.rb 
b/source/plugins/ruby/in_kube_podinventory.rb index d6803fe3d..d7e1063ad 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -20,6 +20,7 @@ def initialize require "set" require "time" require "net/http" + require "fileutils" require_relative "kubernetes_container_inventory" require_relative "KubernetesApiClient" @@ -150,6 +151,7 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] @podInventoryE2EProcessingLatencyMs = 0 + @mdmPodRecords = {} podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") @@ -212,6 +214,7 @@ def enumerate(podList = nil) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil + @mdmPodRecords = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -239,7 +242,7 @@ def enumerate(podList = nil) ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", @controllerSet.length, telemetryProperties) if @winContainerCount > 0 telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount - telemetryProperties["WindowsNodeCount"] = @windowsNodeCount + telemetryProperties["WindowsNodeCount"] = @windowsNodeNameListCache.length telemetryProperties["ClusterWideWindowsContainerInventoryTotalSizeKB"] = @winContainerInventoryTotalSizeBytes / 1024 telemetryProperties["WindowsContainerCountWithInventoryRecordSize64KBorMore"] = @winContainerCountWithInventoryRecordSize64KBOrMore if @winContainerCountWithEnvVarSize64KBOrMore > 0 @@ -298,7 +301,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if KubernetesApiClient.isEmitCacheTelemetry() @windowsContainerRecordsCacheSizeBytes += containerInventoryRecords.to_s.length end - @windowsNodeCount = winNodes.length # Send container 
inventory records for containers on windows nodes @winContainerCount += containerInventoryRecords.length containerInventoryRecords.each do |cirecord| @@ -351,14 +353,11 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send - @log.info "Sending pod inventory mdm records to out_mdm" - pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) - @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" - mdm_pod_inventory_es = Fluent::MultiEventStream.new - pod_inventory_mdm_records.each { |pod_inventory_mdm_record| - mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record - } if pod_inventory_mdm_records - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es + if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 + mdmPodRecordsJson = @mdmPodRecords.to_s + @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" + atomic_file_write(Constants::MDM_POD_INVENTORY_STATE_FILE, Constants::MDM_POD_INVENTORY_STATE_TEMP_FILE, mdmPodRecordsJson) + end end if continuationToken.nil? 
# sending kube services inventory records @@ -437,6 +436,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record = {} begin + mdmPodRecord = {} record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated record["Name"] = item["metadata"]["name"] podNameSpace = item["metadata"]["namespace"] @@ -512,7 +512,13 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record["PodRestartCount"] = 0 #Invoke the helper method to compute ready/not ready mdm metric - @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) + mdmPodRecord["PodUid"] = podUid + mdmPodRecord["ControllerName"] = record["ControllerName"] + mdmPodRecord["Namespace"] = record["Namespace"] + mdmPodRecord["status"] = {} + mdmPodRecord["status"]["conditions"] = item["status"]["conditions"] + mdmPodRecord["containeRecords"] = [] + #@inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) podContainers = [] if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty? 
@@ -549,6 +555,13 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record["ContainerRestartCount"] = containerRestartCount containerStatus = container["state"] + + mdmContainerRecord = {} + mdmContainerRecord["state"] = containerStatus + mdmContainerRecord["restartCount"] = containerRestartCount + mdmContainerRecord["lastState"] = container["lastState"] + mdmPodRecord["containeRecords"].push(mdmContainerRecord.dup) + record["ContainerStatusReason"] = "" # state is of the following form , so just picking up the first key name # "state": { @@ -629,6 +642,8 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) records.push(record) end #container status block end + @mdmPodRecords[podUid] = mdmPodRecord + records.each do |record| if !record.nil? record["PodRestartCount"] = podRestartCount @@ -1083,5 +1098,16 @@ def watch_windows_nodes end $log.info("in_kube_podinventory::watch_windows_nodes:End @ #{Time.now.utc.iso8601}") end + + def atomic_file_write(path, temp_path, content) + begin + File.open(temp_path, "w+") do |f| + f.write(content) + end + FileUtils.mv(temp_path, path) + rescue => err + $log.warn "in_kube_podinventory::atomic_file_write: failed with an error: #{err}" + end + end end # Kube_Pod_Input end # module From a0e4498bdc559e9bffa03485963fba188bb51db4 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 5 Feb 2022 18:47:38 -0800 Subject: [PATCH 22/65] mdm state file --- source/plugins/ruby/in_kube_podinventory.rb | 44 ++++++++++++++++----- source/plugins/ruby/podinventory_to_mdm.rb | 13 +----- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index d7e1063ad..9f8865e95 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -151,7 +151,7 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] 
@podInventoryE2EProcessingLatencyMs = 0 - @mdmPodRecords = {} + @mdmPodRecords = [] podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") @@ -354,7 +354,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 - mdmPodRecordsJson = @mdmPodRecords.to_s + mdmPodRecordsJson = @mdmPodRecords.to_json @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" atomic_file_write(Constants::MDM_POD_INVENTORY_STATE_FILE, Constants::MDM_POD_INVENTORY_STATE_TEMP_FILE, mdmPodRecordsJson) end @@ -513,10 +513,12 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) #Invoke the helper method to compute ready/not ready mdm metric mdmPodRecord["PodUid"] = podUid + mdmPodRecord["Computer"] = nodeName mdmPodRecord["ControllerName"] = record["ControllerName"] mdmPodRecord["Namespace"] = record["Namespace"] - mdmPodRecord["status"] = {} - mdmPodRecord["status"]["conditions"] = item["status"]["conditions"] + mdmPodRecord["PodStatus"] = record["PodStatus"] + mdmPodRecord["PodReadyCondition"] = getPodReadyCondition(item["status"]["conditions"]) + mdmPodRecord["ControllerKind"] = record["ControllerKind"] mdmPodRecord["containeRecords"] = [] #@inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) @@ -557,11 +559,6 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) containerStatus = container["state"] mdmContainerRecord = {} - mdmContainerRecord["state"] = containerStatus - mdmContainerRecord["restartCount"] = containerRestartCount - mdmContainerRecord["lastState"] = container["lastState"] - 
mdmPodRecord["containeRecords"].push(mdmContainerRecord.dup) - record["ContainerStatusReason"] = "" # state is of the following form , so just picking up the first key name # "state": { @@ -586,6 +583,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) end # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm if !record["ControllerKind"].nil? && record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB + mdmContainerRecord["state"] = containerStatus @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus) end end @@ -614,6 +612,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled if lastStateReason.downcase == Constants::REASON_OOM_KILLED + mdmContainerRecord["lastState"] = container["lastState"] @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) end lastStateReason = nil @@ -626,6 +625,8 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) #Populate mdm metric for container restart count if greater than 0 if (!containerRestartCount.nil? && (containerRestartCount.is_a? Integer) && containerRestartCount > 0) + mdmContainerRecord["restartCount"] = containerRestartCount + mdmContainerRecord["lastState"] = container["lastState"] @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) end rescue => errorStr @@ -635,6 +636,10 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record["ContainerLastStatus"] = Hash.new end + if !mdmContainerRecord.empty? 
+ mdmPodRecord["containeRecords"].push(mdmContainerRecord.dup) + end + podRestartCount += containerRestartCount records.push(record.dup) end @@ -642,7 +647,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) records.push(record) end #container status block end - @mdmPodRecords[podUid] = mdmPodRecord + @mdmPodRecords.push(mdmPodRecord.dup) records.each do |record| if !record.nil? @@ -1109,5 +1114,24 @@ def atomic_file_write(path, temp_path, content) $log.warn "in_kube_podinventory::atomic_file_write: failed with an error: #{err}" end end + + def getPodReadyCondition(podStatusConditions) + podReadyCondition = false + begin + if !podStatusConditions.nil? && !podStatusConditions.empty? + podStatusConditions.each do |condition| + if condition["type"] == "Ready" + if condition["status"].downcase == "true" + podReadyCondition = true + end + break #Exit the for loop since we found the ready condition + end + end + end + rescue => err + $log.warn "in_kube_podinventory::getPodReadyCondition failed with an error: #{err}" + end + return podReadyCondition + end end # Kube_Pod_Input end # module diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index c24a91a87..278632cb0 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -218,24 +218,13 @@ def process_record_for_container_restarts_metric(podControllerNameDimValue, podN end end - def process_record_for_pods_ready_metric(podControllerNameDimValue, podNamespaceDimValue, podStatusConditions) + def process_record_for_pods_ready_metric(podControllerNameDimValue, podNamespaceDimValue, podReadyCondition) if @process_incoming_stream begin @log.info "in process_record_for_pods_ready_metric..." if podControllerNameDimValue.nil? || podControllerNameDimValue.empty? podControllerNameDimValue = "No Controller" end - podReadyCondition = false - if !podStatusConditions.nil? && !podStatusConditions.empty? 
- podStatusConditions.each do |condition| - if condition["type"] == "Ready" - if condition["status"].downcase == "true" - podReadyCondition = true - end - break #Exit the for loop since we found the ready condition - end - end - end MdmMetricsGenerator.generatePodReadyMetrics(podControllerNameDimValue, podNamespaceDimValue, podReadyCondition) rescue => errorStr From de4f4b5d1245fec50c13d9ca804167d2ef5b70f1 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 6 Feb 2022 09:18:06 -0800 Subject: [PATCH 23/65] podmdm to separate plugin --- build/linux/installer/conf/kube.conf | 49 ++--- kubernetes/linux/main.sh | 12 ++ source/plugins/ruby/in_kube_podinventory.rb | 9 - .../plugins/ruby/in_kube_podmdminventory.rb | 168 ++++++++++++++++++ 4 files changed, 208 insertions(+), 30 deletions(-) create mode 100644 source/plugins/ruby/in_kube_podmdminventory.rb diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 6f4d91fe6..c3cbe95b1 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -124,25 +124,6 @@ keepalive true - - - @type mdm - @id out_mdm_podinventory - @log_level debug - - @type file - path /var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer - overflow_action drop_oldest_chunk - chunk_limit_size 4m - queue_limit_length 20 - flush_interval 20s - retry_max_times 10 - retry_wait 5s - retry_max_interval 5m - flush_thread_count 5 - - retry_mdm_post_wait_minutes 30 - #Kubernetes Nodes @@ -264,6 +245,33 @@ keepalive true + + #Kubernetes podmdm inventory + + @type kube_pdmdminventory + run_interval 60 + @log_level debug + + + + @type mdm + @id out_mdm_podinventory + @log_level debug + + @type file + path /var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + 
retry_mdm_post_wait_minutes 30 + + #fluent forward plugin # @@ -272,8 +280,7 @@ # bind 0.0.0.0 # chunk_size_limit 4m # - - #Kubernetes perf inventory + #Kubernetes perf inventory @type kube_perfinventory tag oneagent.containerInsights.LINUX_PERF_BLOB diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 048dfcaa0..c04fd8eac 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -593,17 +593,27 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else case $NUM_OF_FLUENTD_WORKERS in + 5) + export NUM_OF_FLUENTD_WORKERS=5 + export FLUENTD_POD_INVENTORY_WORKER_ID=4 + export FLUENTD_NODE_INVENTORY_WORKER_ID=3 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; 4) export NUM_OF_FLUENTD_WORKERS=4 export FLUENTD_POD_INVENTORY_WORKER_ID=3 export FLUENTD_NODE_INVENTORY_WORKER_ID=2 export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; 3) export NUM_OF_FLUENTD_WORKERS=3 export FLUENTD_POD_INVENTORY_WORKER_ID=2 export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; @@ -611,6 +621,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then export NUM_OF_FLUENTD_WORKERS=2 export FLUENTD_POD_INVENTORY_WORKER_ID=1 export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; @@ -620,6 +631,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then export FLUENTD_POD_INVENTORY_WORKER_ID=0 export FLUENTD_NODE_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 + export 
FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 ;; esac diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 9f8865e95..6369471f4 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -4,12 +4,9 @@ require "fluent/plugin/input" module Fluent::Plugin - require_relative "podinventory_to_mdm" - class Kube_PodInventory_Input < Input Fluent::Plugin.register_input("kube_podinventory", self) - @@MDMKubePodInventoryTag = "mdm.kubepodinventory" @@hostName = (OMS::Common.get_hostname) def initialize @@ -69,7 +66,6 @@ def initialize def configure(conf) super - @inventoryToMdmConvertor = Inventory2MdmConvertor.new() end def start @@ -285,7 +281,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc podInventoryRecords.each do |record| if !record.nil? eventStream.add(emitTime, record) if record - @inventoryToMdmConvertor.process_pod_inventory_record(record) end end # Setting this flag to true so that we can send ContainerInventory records for containers @@ -520,7 +515,6 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) mdmPodRecord["PodReadyCondition"] = getPodReadyCondition(item["status"]["conditions"]) mdmPodRecord["ControllerKind"] = record["ControllerKind"] mdmPodRecord["containeRecords"] = [] - #@inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) podContainers = [] if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty? @@ -584,7 +578,6 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm if !record["ControllerKind"].nil? 
&& record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB mdmContainerRecord["state"] = containerStatus - @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus) end end @@ -613,7 +606,6 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled if lastStateReason.downcase == Constants::REASON_OOM_KILLED mdmContainerRecord["lastState"] = container["lastState"] - @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) end lastStateReason = nil else @@ -627,7 +619,6 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) if (!containerRestartCount.nil? && (containerRestartCount.is_a? Integer) && containerRestartCount > 0) mdmContainerRecord["restartCount"] = containerRestartCount mdmContainerRecord["lastState"] = container["lastState"] - @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) end rescue => errorStr $log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}" diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb new file mode 100644 index 000000000..9432e4fe0 --- /dev/null +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -0,0 +1,168 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require "fluent/plugin/input" + +module Fluent::Plugin + require_relative "podinventory_to_mdm" + + class Kube_PodMDMInventory_Input < Input + Fluent::Plugin.register_input("kube_podmdminventory", self) + + @@MDMKubePodInventoryTag = "mdm.kubepodinventory" + + def initialize + super + require "yaml" + require "yajl/json_gem" + require "yajl" + require "set" + require "time" + require "net/http" 
+ require "fileutils" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + require_relative "constants" + end + + config_param :run_interval, :time, :default => 60 + + def configure(conf) + super + @inventoryToMdmConvertor = Inventory2MdmConvertor.new() + end + + def start + if @run_interval + super + $log.info("in_kube_podmdminventory::start @ #{Time.now.utc.iso8601}") + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + super # This super must be at the end of shutdown method + end + end + + def enumerate + begin + batchTime = currentTime.utc.iso8601 + parse_and_emit_records(batchTime) + rescue => errorStr + $log.warn "in_kube_podmdminventory::enumerate:Failed in enumerate: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def parse_and_emit_records(batchTime = Time.utc.iso8601) + currentTime = Time.now + begin + if File.exists?(Constants::MDM_POD_INVENTORY_STATE_FILE) + content = File.read(Constants::MDM_POD_INVENTORY_STATE_FILE) + if !content.empty? + mdmPodRecords = Yajl::Parser.parse(StringIO.new(content)) + if !mdmPodRecords.nil? && !mdmPodRecords.empty? + mdmPodRecords.each do |record| + @inventoryToMdmConvertor.process_pod_inventory_record(record) + @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) + containeRecords = record["containeRecords"] + if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 + containeRecords.each do |containerRecord| + if !containerRecord["state"].nil? && !containerRecord["state"].empty? 
+ @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) + end + begin + if !container["lastState"].nil? && container["lastState"].keys.length == 1 + lastStateName = container["lastState"].keys[0] + lastStateObject = container["lastState"][lastStateName] + if !lastStateObject.is_a?(Hash) + raise "expected a hash object. This could signify a bug or a kubernetes API change" + end + if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") + lastStateReason = lastStateObject["reason"] + lastFinishedTime = lastStateObject["finishedAt"] + #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled + if lastStateReason.downcase == Constants::REASON_OOM_KILLED + @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + lastStateReason = nil + end + end + containerRestartCount = containerRecord["restartCount"] + #Populate mdm metric for container restart count if greater than 0 + if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) + @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + rescue => err + $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}" + $log.debug_backtrace(err.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(err) + end + end + end + end + @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" + pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) + @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" + mdm_pod_inventory_es = Fluent::MultiEventStream.new + pod_inventory_mdm_records.each { |pod_inventory_mdm_record| + mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record + } if pod_inventory_mdm_records + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es + end + end + else + $log.warn "in_kube_podmdminventory:parse_and_emit_records:MDM pod inventory state file doesnt exist @ #{Time.now.utc.iso8601}" + end + rescue => errorStr + $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_kube_podmdminventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}") + enumerate + 
$log.info("in_kube_podmdminventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kube_podmdminventory::run_periodic: enumerate Failed to retrieve pod inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock + end + @mutex.unlock + end + end # Kube_Pod_Input +end # module From 4ea1d698ee1cb3a86e85e8b7935ae40ff2c35acc Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 6 Feb 2022 19:03:37 -0800 Subject: [PATCH 24/65] bug fixes --- build/linux/installer/conf/kube.conf | 2 +- build/linux/installer/datafiles/base_container.data | 1 + kubernetes/linux/main.sh | 2 ++ source/plugins/ruby/in_kube_podmdminventory.rb | 10 ++++++---- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index c3cbe95b1..d8bcc53da 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -248,7 +248,7 @@ #Kubernetes podmdm inventory - @type kube_pdmdminventory + @type kube_podmdminventory run_interval 60 @log_level debug diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 650b19243..328a846c7 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -162,6 +162,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/fluent/plugin/in_containerinventory.rb; source/plugins/ruby/in_containerinventory.rb; 644; root; root /etc/fluent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root /etc/fluent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root +/etc/fluent/plugin/in_kube_podmdminventory.rb; source/plugins/ruby/in_kube_podmdminventory.rb; 644; root; root /etc/fluent/plugin/in_kube_perfinventory.rb; source/plugins/ruby/in_kube_perfinventory.rb; 644; root; root 
/etc/fluent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root /etc/fluent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c04fd8eac..c280a31a0 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -640,6 +640,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc source ~/.bashrc echo "*** fluentd worker configuration ***" @@ -647,6 +648,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}" echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}" + echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "*** starting fluentd v1 in replicaset" diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 9432e4fe0..84badf112 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -57,6 +57,7 @@ def shutdown def enumerate begin + currentTime = Time.now batchTime = currentTime.utc.iso8601 parse_and_emit_records(batchTime) rescue => errorStr @@ -67,12 +68,13 @@ def enumerate end def parse_and_emit_records(batchTime = Time.utc.iso8601) - currentTime = Time.now begin if File.exists?(Constants::MDM_POD_INVENTORY_STATE_FILE) content = 
File.read(Constants::MDM_POD_INVENTORY_STATE_FILE) if !content.empty? + $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" mdmPodRecords = Yajl::Parser.parse(StringIO.new(content)) + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" if !mdmPodRecords.nil? && !mdmPodRecords.empty? mdmPodRecords.each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @@ -84,9 +86,9 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) end begin - if !container["lastState"].nil? && container["lastState"].keys.length == 1 - lastStateName = container["lastState"].keys[0] - lastStateObject = container["lastState"][lastStateName] + if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1 + lastStateName = containerRecord["lastState"].keys[0] + lastStateObject = containerRecord["lastState"][lastStateName] if !lastStateObject.is_a?(Hash) raise "expected a hash object. This could signify a bug or a kubernetes API change" end From 45d3e03653b45a62615fa5969e1c2c5554e4d6ea Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 6 Feb 2022 20:41:20 -0800 Subject: [PATCH 25/65] bug fixes --- source/plugins/ruby/in_kube_podinventory.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 6369471f4..7b52b97dd 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -347,11 +347,13 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerInventoryStream = nil end - if continuationToken.nil? 
#no more chunks in this batch to be sent, get all mdm pod inventory records to send + if continuationToken.nil? #no more chunks in this batch to be sent, write all mdm pod inventory records to send if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 mdmPodRecordsJson = @mdmPodRecords.to_json @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" + @log.info "in_kube_podinventory::parse_and_emit_records:Start:atomic_file_write @ #{Time.now.utc.iso8601}" atomic_file_write(Constants::MDM_POD_INVENTORY_STATE_FILE, Constants::MDM_POD_INVENTORY_STATE_TEMP_FILE, mdmPodRecordsJson) + @log.info "in_kube_podinventory::parse_and_emit_records:End:atomic_file_write @ #{Time.now.utc.iso8601}" end end From 5481e4870e4fadb5a315d5426266efbef7322767 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 7 Feb 2022 08:26:36 -0800 Subject: [PATCH 26/65] bug fixes --- .../plugins/ruby/in_kube_podmdminventory.rb | 99 ++++++++++--------- 1 file changed, 52 insertions(+), 47 deletions(-) diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 84badf112..30337c9b7 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -70,60 +70,61 @@ def enumerate def parse_and_emit_records(batchTime = Time.utc.iso8601) begin if File.exists?(Constants::MDM_POD_INVENTORY_STATE_FILE) - content = File.read(Constants::MDM_POD_INVENTORY_STATE_FILE) - if !content.empty? - $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" - mdmPodRecords = Yajl::Parser.parse(StringIO.new(content)) - $log.info "in_kube_podmdminventory:parse_and_emit_records:End:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" - if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
- mdmPodRecords.each do |record| - @inventoryToMdmConvertor.process_pod_inventory_record(record) - @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) - containeRecords = record["containeRecords"] - if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 - containeRecords.each do |containerRecord| - if !containerRecord["state"].nil? && !containerRecord["state"].empty? - @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) - end - begin - if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1 - lastStateName = containerRecord["lastState"].keys[0] - lastStateObject = containerRecord["lastState"][lastStateName] - if !lastStateObject.is_a?(Hash) - raise "expected a hash object. This could signify a bug or a kubernetes API change" - end - if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") - lastStateReason = lastStateObject["reason"] - lastFinishedTime = lastStateObject["finishedAt"] - #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled - if lastStateReason.downcase == Constants::REASON_OOM_KILLED - @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) - end - lastStateReason = nil - end + file = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") + $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" + mdmPodRecords = Yajl::Parser.parse(file) + if !file.nil? + file.close + end + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" + if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
+ mdmPodRecords.each do |record| + @inventoryToMdmConvertor.process_pod_inventory_record(record) + @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) + containeRecords = record["containeRecords"] + if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 + containeRecords.each do |containerRecord| + if !containerRecord["state"].nil? && !containerRecord["state"].empty? + @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) + end + begin + if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1 + lastStateName = containerRecord["lastState"].keys[0] + lastStateObject = containerRecord["lastState"][lastStateName] + if !lastStateObject.is_a?(Hash) + raise "expected a hash object. This could signify a bug or a kubernetes API change" end - containerRestartCount = containerRecord["restartCount"] - #Populate mdm metric for container restart count if greater than 0 - if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) - @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") + lastStateReason = lastStateObject["reason"] + lastFinishedTime = lastStateObject["finishedAt"] + #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled + if lastStateReason.downcase == Constants::REASON_OOM_KILLED + @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + lastStateReason = nil end - rescue => err - $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}" - $log.debug_backtrace(err.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(err) end + containerRestartCount = containerRecord["restartCount"] + #Populate mdm metric for container restart count if greater than 0 + if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) + @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + rescue => err + $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}" + $log.debug_backtrace(err.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(err) end end end - @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" - pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) - @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" - mdm_pod_inventory_es = Fluent::MultiEventStream.new - pod_inventory_mdm_records.each { |pod_inventory_mdm_record| - mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record - } if pod_inventory_mdm_records - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end + @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" + pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) + @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" + mdm_pod_inventory_es = Fluent::MultiEventStream.new + pod_inventory_mdm_records.each { |pod_inventory_mdm_record| + mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record + } if pod_inventory_mdm_records + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end else $log.warn "in_kube_podmdminventory:parse_and_emit_records:MDM pod inventory state file doesnt exist @ #{Time.now.utc.iso8601}" @@ -132,6 +133,10 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) $log.warn 
"in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + ensure + if !file.nil? + file.close + end end end From 1ea93668f803f77661987b7edb867fd106fed663 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 7 Feb 2022 19:46:59 -0800 Subject: [PATCH 27/65] podmdm plugin --- source/plugins/ruby/constants.rb | 1 - source/plugins/ruby/in_kube_perfinventory.rb | 2 - source/plugins/ruby/in_kube_podinventory.rb | 38 ++++-- .../plugins/ruby/in_kube_podmdminventory.rb | 129 +++++++++++------- 4 files changed, 107 insertions(+), 63 deletions(-) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 0b16e82f8..5576d9917 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -139,5 +139,4 @@ class Constants # FileName for MDM POD Inventory records MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json" - MDM_POD_INVENTORY_STATE_TEMP_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryStateTemp.json" end diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 7403b86f3..9733130af 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -155,7 +155,6 @@ def parse_and_emit_records(podInventory, nodeAllocatableRecords, continuationTok @@istestvar = ENV["ISTEST"] begin #begin block start - # # Getting windows nodes from kubeapi podInventory["items"].each do |item| #podInventory block start nodeName = "" if !item["spec"]["nodeName"].nil? 
@@ -272,7 +271,6 @@ def watch_pods @podCacheMutex.synchronize { @podItemsCache.clear() } - currentWindowsNodeNameList = [] continuationToken = nil $log.info("in_kube_perfinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 7b52b97dd..8432965a4 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -351,9 +351,9 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 mdmPodRecordsJson = @mdmPodRecords.to_json @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" - @log.info "in_kube_podinventory::parse_and_emit_records:Start:atomic_file_write @ #{Time.now.utc.iso8601}" - atomic_file_write(Constants::MDM_POD_INVENTORY_STATE_FILE, Constants::MDM_POD_INVENTORY_STATE_TEMP_FILE, mdmPodRecordsJson) - @log.info "in_kube_podinventory::parse_and_emit_records:End:atomic_file_write @ #{Time.now.utc.iso8601}" + @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" + writeMDMRecords(mdmPodRecordsJson) + @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" end end @@ -1097,14 +1097,36 @@ def watch_windows_nodes $log.info("in_kube_podinventory::watch_windows_nodes:End @ #{Time.now.utc.iso8601}") end - def atomic_file_write(path, temp_path, content) + def writeMDMRecords(mdmRecordsJson) + maxRetryCount = 3 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 begin - File.open(temp_path, "w+") do |f| - f.write(content) + f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "w") + 
if !f.nil? + isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + f.truncate(0) + f.write(mdmRecordsJson) + f.flush + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + $log.info "in_kube_podinventory:writeMDMRecords:Successfull and with time taken(ms): #{timetakenMs}" + else + raise "writeMDMRecords:Failed to open file for write" end - FileUtils.mv(temp_path, path) rescue => err - $log.warn "in_kube_podinventory::atomic_file_write: failed with an error: #{err}" + if retryAttemptCount < MaxRetryCount + retryAttemptCount = retryAttemptCount + 1 + sleep (initialRetryDelay * retryAttemptCount) + retry + end + $log.warn "in_kube_podinventory:writeMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ensure + if !f.nil? + f.flock(File::LOCK_UN) + f.close + end end end diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 30337c9b7..2afa9a547 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -69,65 +69,57 @@ def enumerate def parse_and_emit_records(batchTime = Time.utc.iso8601) begin - if File.exists?(Constants::MDM_POD_INVENTORY_STATE_FILE) - file = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") - $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" - mdmPodRecords = Yajl::Parser.parse(file) - if !file.nil? - file.close - end - $log.info "in_kube_podmdminventory:parse_and_emit_records:End:Parsing MDM pod records using yajl @ #{Time.now.utc.iso8601}" - if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
- mdmPodRecords.each do |record| - @inventoryToMdmConvertor.process_pod_inventory_record(record) - @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) - containeRecords = record["containeRecords"] - if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 - containeRecords.each do |containerRecord| - if !containerRecord["state"].nil? && !containerRecord["state"].empty? - @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) - end - begin - if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1 - lastStateName = containerRecord["lastState"].keys[0] - lastStateObject = containerRecord["lastState"][lastStateName] - if !lastStateObject.is_a?(Hash) - raise "expected a hash object. This could signify a bug or a kubernetes API change" - end - if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") - lastStateReason = lastStateObject["reason"] - lastFinishedTime = lastStateObject["finishedAt"] - #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled - if lastStateReason.downcase == Constants::REASON_OOM_KILLED - @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) - end - lastStateReason = nil - end + $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:readMDMRecords @ #{Time.now.utc.iso8601}" + mdmPodRecords = readMDMRecords() + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:readMDMRecords @ #{Time.now.utc.iso8601}" + if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
+ mdmPodRecords.each do |record| + @inventoryToMdmConvertor.process_pod_inventory_record(record) + @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) + containeRecords = record["containeRecords"] + if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 + containeRecords.each do |containerRecord| + if !containerRecord["state"].nil? && !containerRecord["state"].empty? + @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) + end + begin + if !containerRecord["lastState"].nil? && containerRecord["lastState"].keys.length == 1 + lastStateName = containerRecord["lastState"].keys[0] + lastStateObject = containerRecord["lastState"][lastStateName] + if !lastStateObject.is_a?(Hash) + raise "expected a hash object. This could signify a bug or a kubernetes API change" end - containerRestartCount = containerRecord["restartCount"] - #Populate mdm metric for container restart count if greater than 0 - if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) - @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") + lastStateReason = lastStateObject["reason"] + lastFinishedTime = lastStateObject["finishedAt"] + #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled + if lastStateReason.downcase == Constants::REASON_OOM_KILLED + @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + lastStateReason = nil end - rescue => err - $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}" - $log.debug_backtrace(err.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(err) end + containerRestartCount = containerRecord["restartCount"] + #Populate mdm metric for container restart count if greater than 0 + if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) + @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + rescue => err + $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed while processing ContainerLastStatus: #{err}" + $log.debug_backtrace(err.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(err) end end end - @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" - pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) - @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" - mdm_pod_inventory_es = Fluent::MultiEventStream.new - pod_inventory_mdm_records.each { |pod_inventory_mdm_record| - mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record - } if pod_inventory_mdm_records - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end - else - $log.warn "in_kube_podmdminventory:parse_and_emit_records:MDM pod inventory state file doesnt exist @ #{Time.now.utc.iso8601}" + @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" + pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) + @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" + mdm_pod_inventory_es = Fluent::MultiEventStream.new + pod_inventory_mdm_records.each { |pod_inventory_mdm_record| + mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record + } if pod_inventory_mdm_records + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end rescue => errorStr $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}" 
@@ -171,5 +163,38 @@ def run_periodic end @mutex.unlock end + + def readMDMRecords() + maxRetryCount = 3 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 + mdmRecords = {} + begin + f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") + if !f.nil? + isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "readMDMRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + mdmRecords = Yajl::Parser.parse(f) + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + $log.info "in_kube_podmdminventory:readMDMRecords:Number of MDM records: #{mdmRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + else + raise "readMDMRecords:Failed to open file for read" + end + rescue => err + if retryAttemptCount < MaxRetryCount + retryAttemptCount = retryAttemptCount + 1 + sleep (initialRetryDelay * retryAttemptCount) + retry + end + $log.warn "in_kube_podmdminventory:readMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ensure + if !f.nil? 
+ f.flock(File::LOCK_UN) + f.close + end + end + return mdmRecords + end end # Kube_Pod_Input end # module From a12e535c52fdd8a42cf4bd444118e4898be33fc5 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 8 Feb 2022 00:21:44 -0800 Subject: [PATCH 28/65] bug fixes --- build/linux/installer/conf/kube.conf | 2 +- source/plugins/ruby/in_kube_podinventory.rb | 21 +++++----- .../plugins/ruby/in_kube_podmdminventory.rb | 39 ++++++++++--------- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index d8bcc53da..dcdf1cdf8 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -262,7 +262,7 @@ path /var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length 50 flush_interval 20s retry_max_times 10 retry_wait 5s diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 8432965a4..aac0247c3 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -516,7 +516,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) mdmPodRecord["PodStatus"] = record["PodStatus"] mdmPodRecord["PodReadyCondition"] = getPodReadyCondition(item["status"]["conditions"]) mdmPodRecord["ControllerKind"] = record["ControllerKind"] - mdmPodRecord["containeRecords"] = [] + mdmPodRecord["containerRecords"] = [] podContainers = [] if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty? @@ -630,7 +630,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) end if !mdmContainerRecord.empty? 
- mdmPodRecord["containeRecords"].push(mdmContainerRecord.dup) + mdmPodRecord["containerRecords"].push(mdmContainerRecord.dup) end podRestartCount += containerRestartCount @@ -1105,28 +1105,27 @@ def writeMDMRecords(mdmRecordsJson) f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "w") if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_podinventory:writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i - f.truncate(0) f.write(mdmRecordsJson) f.flush timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) $log.info "in_kube_podinventory:writeMDMRecords:Successfull and with time taken(ms): #{timetakenMs}" else - raise "writeMDMRecords:Failed to open file for write" + raise "in_kube_podinventory:writeMDMRecords:Failed to open file for write" end rescue => err - if retryAttemptCount < MaxRetryCount + if retryAttemptCount < maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? retryAttemptCount = retryAttemptCount + 1 - sleep (initialRetryDelay * retryAttemptCount) + sleep (initialRetryDelaySecs * retryAttemptCount) retry end $log.warn "in_kube_podinventory:writeMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" ensure - if !f.nil? - f.flock(File::LOCK_UN) - f.close - end + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? 
end end diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 2afa9a547..a23a84c9b 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -72,13 +72,13 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:readMDMRecords @ #{Time.now.utc.iso8601}" mdmPodRecords = readMDMRecords() $log.info "in_kube_podmdminventory:parse_and_emit_records:End:readMDMRecords @ #{Time.now.utc.iso8601}" - if !mdmPodRecords.nil? && !mdmPodRecords.empty? + if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmRecords.length > 0 mdmPodRecords.each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) - containeRecords = record["containeRecords"] - if !containeRecords.nil? && !containeRecords.empty? && containeRecords.length > 0 - containeRecords.each do |containerRecord| + containerRecords = record["containerRecords"] + if !containerRecords.nil? && !containerRecords.empty? && containerRecords.length > 0 + containerRecords.each do |containerRecord| if !containerRecord["state"].nil? && !containerRecord["state"].empty? 
@inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerRecord["state"]) end @@ -118,17 +118,20 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) mdm_pod_inventory_es = Fluent::MultiEventStream.new pod_inventory_mdm_records.each { |pod_inventory_mdm_record| mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record + if mdm_pod_inventory_es.count >= 5000 # 5k records of MDM is ~2MB and each record is ~400 bytes + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) + mdm_pod_inventory_es = Fluent::MultiEventStream.new + end } if pod_inventory_mdm_records - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es + if mdm_pod_inventory_es.count > 0 + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) + end + mdm_pod_inventory_es = nil end rescue => errorStr $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - ensure - if !file.nil? - file.close - end end end @@ -168,31 +171,31 @@ def readMDMRecords() maxRetryCount = 3 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 - mdmRecords = {} + mdmRecords = [] begin f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") if !f.nil? 
isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "readMDMRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_podmdminventory:readMDMRecords:Failed to acquire file lock" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i mdmRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) $log.info "in_kube_podmdminventory:readMDMRecords:Number of MDM records: #{mdmRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "readMDMRecords:Failed to open file for read" + raise "in_kube_podmdminventory:readMDMRecords:Failed to open file for read" end rescue => err - if retryAttemptCount < MaxRetryCount + if retryAttemptCount < maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? retryAttemptCount = retryAttemptCount + 1 - sleep (initialRetryDelay * retryAttemptCount) + sleep (initialRetryDelaySecs * retryAttemptCount) retry end $log.warn "in_kube_podmdminventory:readMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" ensure - if !f.nil? - f.flock(File::LOCK_UN) - f.close - end + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? 
end return mdmRecords end From 03e0b439acbd68ee3f61bf8969b09fd4bdbe7837 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 8 Feb 2022 11:18:37 -0800 Subject: [PATCH 29/65] bug fixes --- source/plugins/ruby/in_kube_podmdminventory.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index a23a84c9b..971197f49 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -72,7 +72,7 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:readMDMRecords @ #{Time.now.utc.iso8601}" mdmPodRecords = readMDMRecords() $log.info "in_kube_podmdminventory:parse_and_emit_records:End:readMDMRecords @ #{Time.now.utc.iso8601}" - if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmRecords.length > 0 + if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmPodRecords.length > 0 mdmPodRecords.each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) From ab27436bd6decec7e90e51999697a842937727a2 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 8 Feb 2022 12:51:19 -0800 Subject: [PATCH 30/65] remove unneeded log lines --- source/plugins/ruby/in_kube_nodes.rb | 5 +++-- source/plugins/ruby/in_kube_perfinventory.rb | 8 +++---- source/plugins/ruby/in_kube_podinventory.rb | 23 ++++++++++---------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 3e8e8ee71..146da8f9d 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -652,7 +652,7 @@ def watch_nodes !item["metadata"].nil? && !item["metadata"].empty? 
&& !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? nodesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") else $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil @@ -691,7 +691,8 @@ def watch_nodes end end rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 9733130af..c37c3ce0e 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -340,9 +340,9 @@ def watch_pods !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
podsResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") podsResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! break @@ -377,11 +377,11 @@ def watch_pods $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end - $log.info("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end rescue Net::ReadTimeout => errorStr ## This expected if there is no activity more than readtimeout value used in the connection - $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + # $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") podsResourceVersion = nil diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index aac0247c3..24eea4dbf 100644 
--- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -795,9 +795,9 @@ def watch_pods !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? podsResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") podsResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
break @@ -844,11 +844,11 @@ def watch_pods $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end - $log.info("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity more than readtimeout value used in the connection - $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") podsResourceVersion = nil @@ -923,9 +923,9 @@ def watch_services !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
servicesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! break @@ -962,7 +962,7 @@ def watch_services end end rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + # $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil @@ -1051,9 +1051,9 @@ def watch_windows_nodes !item["metadata"].nil? && !item["metadata"].empty? && !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
nodesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + # $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") else - $log.info("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! break @@ -1081,7 +1081,8 @@ def watch_windows_nodes end end rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + ## This expected if there is no activity more than readtimeout value used in the connection + # $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil From 541e50da0b3f1cb2433c4be5c11da50e8ba06eb2 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 8 Feb 2022 19:06:29 -0800 Subject: [PATCH 31/65] more improvements --- source/plugins/ruby/constants.rb | 1 + source/plugins/ruby/in_kube_nodes.rb | 53 ++++++ source/plugins/ruby/in_kube_perfinventory.rb | 170 ++++-------------- source/plugins/ruby/in_kube_podinventory.rb | 1 + .../plugins/ruby/in_kube_podmdminventory.rb | 17 +- 5 files changed, 95 insertions(+), 147 deletions(-) diff --git 
a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 5576d9917..6f8c1256f 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -139,4 +139,5 @@ class Constants # FileName for MDM POD Inventory records MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json" + NODE_ALLOCATABLE_RECORDS_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/NodeAllocatableRecords.json" end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 146da8f9d..d3077e713 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -202,10 +202,19 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) insightsMetricsEventStream = Fluent::MultiEventStream.new kubePerfEventStream = Fluent::MultiEventStream.new @@istestvar = @env["ISTEST"] + nodeAllocatableRecords = {} #get node inventory nodeInventory["items"].each do |item| # node inventory nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) + # node allocatble records for the kube perf plugin + nodeName = item["metadata"]["name"] + if !nodeName.nil? && !nodeName.empty? + nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) + if !nodeAllocatable.nil? && !nodeAllocatable.empty? + nodeAllocatableRecords[nodeName] = nodeAllocatable + end + end eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") @@ -425,6 +434,17 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end end + if !nodeAllocatableRecords.nil? && !nodeAllocatableRecords.empty? 
+ nodeAllocatableRecordsJson = nodeAllocatableRecords.to_json + if !nodeAllocatableRecordsJson.empty? + @log.info "Writing node allocatable records to state file with size(bytes): #{nodeAllocatableRecordsJson.length}" + @log.info "in_kube_nodes::parse_and_emit_records:Start:writeNodeAllocatableRecords @ #{Time.now.utc.iso8601}" + writeNodeAllocatableRecords(nodeAllocatableRecordsJson) + @log.info "in_kube_nodes::parse_and_emit_records:End:writeNodeAllocatableRecords @ #{Time.now.utc.iso8601}" + end + nodeAllocatableRecordsJson = nil + nodeAllocatableRecords = nil + end rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -707,6 +727,39 @@ def watch_nodes end $log.info("in_kube_nodes::watch_nodes:End @ #{Time.now.utc.iso8601}") end + + def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) + maxRetryCount = 3 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 + begin + f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "w") + if !f.nil? + isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + f.write(nodeAllocatbleRecordsJson) + f.flush + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + $log.info "in_kube_nodes::writeNodeAllocatableRecords:Successfull and with time taken(ms): #{timetakenMs}" + else + raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to open file for write" + end + rescue => err + if retryAttemptCount < maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? 
+ retryAttemptCount = retryAttemptCount + 1 + sleep (initialRetryDelaySecs * retryAttemptCount) + retry + end + $log.warn "in_kube_nodes::writeNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) + ensure + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? + end + end end # Kube_Node_Input class NodeStatsCache diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index c37c3ce0e..5faae3194 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -33,7 +33,6 @@ def initialize @podItemsCache = {} @watchNodesThread = nil - @nodeAllocatableCache = {} @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" @@ -80,9 +79,7 @@ def start @condition = ConditionVariable.new @mutex = Mutex.new @podCacheMutex = Mutex.new - @nodeAllocatableCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) - @watchNodesThread = Thread.new(&method(:watch_nodes)) @watchPodsThread = Thread.new(&method(:watch_pods)) end end @@ -118,11 +115,7 @@ def enumerate(podList = nil) $log.info("in_kube_perfinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") end - nodeAllocatableRecords = {} - nodeAllocatableCacheSizeKB = 0 - @nodeAllocatableCacheMutex.synchronize { - nodeAllocatableRecords = @nodeAllocatableCache.clone - } + nodeAllocatableRecords = getNodeAllocatableRecords() $log.info("in_kube_perfinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") # Initializing continuation token to nil continuationToken = nil @@ -397,139 +390,38 @@ def watch_pods $log.info("in_kube_perfinventory::watch_pods:End @ #{Time.now.utc.iso8601}") end - def watch_nodes - 
$log.info("in_kube_perfinventory::watch_nodes:Start @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - loop do - begin - if nodesResourceVersion.nil? - # clear node limits cache before filling the cache with list - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache.clear() - } - continuationToken = nil - $log.info("in_kube_perfinventory::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_perfinventory::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_perfinventory::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| - key = item["metadata"]["name"] - if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - if !nodeAllocatable.nil? && !nodeAllocatable.empty? - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end - else - $log.warn "in_kube_perfinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" - end - while (!continuationToken.nil? && !continuationToken.empty?) 
- continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_perfinventory::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| - key = item["metadata"]["name"] - if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - if !nodeAllocatable.nil? && !nodeAllocatable.empty? - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end - else - $log.warn "in_kube_perfinventory::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" - end - end - end - begin - $log.info("in_kube_perfinventory::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_perfinventory::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? 
&& - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - nodesResourceVersion = item["metadata"]["resourceVersion"] - $log.info("in_kube_perfinventory::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_perfinventory::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["name"] - if !key.nil? && !key.empty? - nodeAllocatable = KubernetesApiClient.getNodeAllocatableValues(item) - if !nodeAllocatable.nil? && !nodeAllocatable.empty? - @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache[key] = nodeAllocatable - } - else - $log.warn "in_kube_perfinventory::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_perfinventory::watch_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["name"] - if !key.nil? && !key.empty? 
- @nodeAllocatableCacheMutex.synchronize { - @nodeAllocatableCache.delete(key) - } - end - end - when "ERROR" - nodesResourceVersion = nil - $log.warn("in_kube_perfinventory::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_perfinventory::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") - end - end - end - rescue Net::ReadTimeout => errorStr - $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher - end - rescue => errorStr - $log.warn("in_kube_perfinventory::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil + def getNodeAllocatableRecords() + maxRetryCount = 3 + initialRetryDelaySecs = 0.5 + retryAttemptCount = 1 + nodeAllocatableRecords = {} + begin + f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "r") + if !f.nil? + isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) + raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock + startTime = (Time.now.to_f * 1000).to_i + nodeAllocatableRecords = Yajl::Parser.parse(f) + timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + $log.info "in_kube_perfinventory:getNodeAllocatableRecords:Number of Node Allocatable records: #{nodeAllocatableRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + else + raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to open file for read" + end + rescue => err + if retryAttemptCount < maxRetryCount + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? 
+ retryAttemptCount = retryAttemptCount + 1 + sleep (initialRetryDelaySecs * retryAttemptCount) + retry end + $log.warn "in_kube_perfinventory:getNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) + ensure + f.flock(File::LOCK_UN) if !f.nil? + f.close if !f.nil? end - $log.info("in_kube_perfinventory::watch_nodes:End @ #{Time.now.utc.iso8601}") + return nodeAllocatableRecords end end # Kube_Pod_Input end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 24eea4dbf..70167e012 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1124,6 +1124,7 @@ def writeMDMRecords(mdmRecordsJson) retry end $log.warn "in_kube_podinventory:writeMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) ensure f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 971197f49..98f06dc0c 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -69,9 +69,9 @@ def enumerate def parse_and_emit_records(batchTime = Time.utc.iso8601) begin - $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:readMDMRecords @ #{Time.now.utc.iso8601}" - mdmPodRecords = readMDMRecords() - $log.info "in_kube_podmdminventory:parse_and_emit_records:End:readMDMRecords @ #{Time.now.utc.iso8601}" + $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:getMDMRecords @ #{Time.now.utc.iso8601}" + mdmPodRecords = getMDMRecords() + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
&& mdmPodRecords.length > 0 mdmPodRecords.each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @@ -167,7 +167,7 @@ def run_periodic @mutex.unlock end - def readMDMRecords() + def getMDMRecords() maxRetryCount = 3 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 @@ -176,13 +176,13 @@ def readMDMRecords() f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "in_kube_podmdminventory:readMDMRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_podmdminventory:getMDMRecords:Failed to acquire file lock" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i mdmRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) - $log.info "in_kube_podmdminventory:readMDMRecords:Number of MDM records: #{mdmRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "in_kube_podmdminventory:readMDMRecords:Failed to open file for read" + raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read" end rescue => err if retryAttemptCount < maxRetryCount @@ -192,7 +192,8 @@ def readMDMRecords() sleep (initialRetryDelaySecs * retryAttemptCount) retry end - $log.warn "in_kube_podmdminventory:readMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podmdminventory:getMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" + ApplicationInsightsUtility.sendExceptionTelemetry(err) ensure f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? 
From 589b69a9472eed0321781afa48ebf4daca586d71 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 9 Feb 2022 14:34:05 -0800 Subject: [PATCH 32/65] clean up --- kubernetes/omsagent.yaml | 2 +- source/plugins/ruby/KubernetesApiClient.rb | 19 ++++++++++ source/plugins/ruby/constants.rb | 3 +- source/plugins/ruby/in_kube_perfinventory.rb | 13 ------- source/plugins/ruby/in_kube_podinventory.rb | 37 ++++++-------------- 5 files changed, 33 insertions(+), 41 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 5c9e8f853..b0ccb6712 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -609,7 +609,7 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 4 + cpu: 5 memory: 2Gi requests: cpu: 150m diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index f1afd4ac6..0d4267685 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -1310,6 +1310,25 @@ def getHpaOptimizedItem(resourceItem) return item end + def getPodReadyCondition(podStatusConditions) + podReadyCondition = false + begin + if !podStatusConditions.nil? && !podStatusConditions.empty? + podStatusConditions.each do |condition| + if condition["type"] == "Ready" + if condition["status"].downcase == "true" + podReadyCondition = true + end + break #Exit the for loop since we found the ready condition + end + end + end + rescue => err + @Log.warn "in_kube_podinventory::getPodReadyCondition failed with an error: #{err}" + end + return podReadyCondition + end + def isEmitCacheTelemetry isEmitCacheTelemtryEnabled = false if !ENV["EMIT_CACHE_TELEMETRY"].nil? && !ENV["EMIT_CACHE_TELEMETRY"].empty? 
&& ENV["EMIT_CACHE_TELEMETRY"].downcase == "true" diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 6f8c1256f..ca966fb12 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -137,7 +137,8 @@ class Constants #To evaluate switching to Windows AMA 64KB impacts any existing customers MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY = 65536 - # FileName for MDM POD Inventory records + # FileName for MDM POD Inventory state MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json" + # FileName for NodeAllocatable Records state NODE_ALLOCATABLE_RECORDS_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/NodeAllocatableRecords.json" end diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 5faae3194..00f7b02db 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -27,13 +27,10 @@ def initialize # this configurable via configmap @PODS_CHUNK_SIZE = 0 @PODS_EMIT_STREAM_BATCH_SIZE = 0 - @NODES_CHUNK_SIZE = 0 @watchPodsThread = nil @podItemsCache = {} - @watchNodesThread = nil - @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end @@ -66,15 +63,6 @@ def start end $log.info("in_kube_perfinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") - if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? 
&& ENV["NODES_CHUNK_SIZE"].to_i > 0 - @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i - else - # this shouldnt happen just setting default here as safe guard - $log.warn("in_kube_perfinventory::start: setting to default value since got NODES_CHUNK_SIZE nil or empty") - @NODES_CHUNK_SIZE = 250 - end - $log.info("in_kube_perfinventory::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}") - @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -92,7 +80,6 @@ def shutdown } @thread.join @watchPodsThread.join - @watchNodesThread.join super # This super must be at the end of shutdown method end end diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 70167e012..905fd0e34 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -348,12 +348,16 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if continuationToken.nil? #no more chunks in this batch to be sent, write all mdm pod inventory records to send - if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 - mdmPodRecordsJson = @mdmPodRecords.to_json - @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" - @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" - writeMDMRecords(mdmPodRecordsJson) - @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" + begin + if !@mdmPodRecords.nil? 
&& @mdmPodRecords.length > 0 + mdmPodRecordsJson = @mdmPodRecords.to_json + @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" + @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" + writeMDMRecords(mdmPodRecordsJson) + @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" + end + rescue => err + @log.warn "in_kube_podinventory::parse_and_emit_records: failed to write MDMRecords with an error: #{err} @ #{Time.now.utc.iso8601}" end end @@ -514,7 +518,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) mdmPodRecord["ControllerName"] = record["ControllerName"] mdmPodRecord["Namespace"] = record["Namespace"] mdmPodRecord["PodStatus"] = record["PodStatus"] - mdmPodRecord["PodReadyCondition"] = getPodReadyCondition(item["status"]["conditions"]) + mdmPodRecord["PodReadyCondition"] = KubernetesApiClient.getPodReadyCondition(item["status"]["conditions"]) mdmPodRecord["ControllerKind"] = record["ControllerKind"] mdmPodRecord["containerRecords"] = [] @@ -1130,24 +1134,5 @@ def writeMDMRecords(mdmRecordsJson) f.close if !f.nil? end end - - def getPodReadyCondition(podStatusConditions) - podReadyCondition = false - begin - if !podStatusConditions.nil? && !podStatusConditions.empty? 
- podStatusConditions.each do |condition| - if condition["type"] == "Ready" - if condition["status"].downcase == "true" - podReadyCondition = true - end - break #Exit the for loop since we found the ready condition - end - end - end - rescue => err - $log.warn "in_kube_podinventory::getPodReadyCondition failed with an error: #{err}" - end - return podReadyCondition - end end # Kube_Pod_Input end # module From 37d67b859c6ced09472340584040407f8c7a9897 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 9 Feb 2022 16:33:49 -0800 Subject: [PATCH 33/65] clean up --- .../templates/omsagent-deployment.yaml | 5 + kubernetes/linux/main.sh | 126 +++++++++--------- source/plugins/ruby/in_kube_podinventory.rb | 6 +- 3 files changed, 72 insertions(+), 65 deletions(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index a7ea8b097..ac7cafa13 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -43,6 +43,11 @@ spec: resources: {{ toYaml .Values.omsagent.resources.deployment | indent 9 }} env: + - name: NUM_OF_FLUENTD_WORKERS + valueFrom: + resourceFieldRef: + containerName: omsagent + resource: limits.cpu {{- if ne .Values.omsagent.env.clusterId "" }} - name: AKS_RESOURCE_ID value: {{ .Values.omsagent.env.clusterId | quote }} diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c280a31a0..5f3c4c902 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -80,6 +80,66 @@ checkAgentOnboardingStatus() { fi } +configureFluentDWorkerIDsForRS() { + echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}" + case $NUM_OF_FLUENTD_WORKERS in + 5) + export NUM_OF_FLUENTD_WORKERS=5 + export FLUENTD_POD_INVENTORY_WORKER_ID=4 + export FLUENTD_NODE_INVENTORY_WORKER_ID=3 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 + export 
FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + 4) + export NUM_OF_FLUENTD_WORKERS=4 + export FLUENTD_POD_INVENTORY_WORKER_ID=3 + export FLUENTD_NODE_INVENTORY_WORKER_ID=2 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + 3) + export NUM_OF_FLUENTD_WORKERS=3 + export FLUENTD_POD_INVENTORY_WORKER_ID=2 + export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + 2) + export NUM_OF_FLUENTD_WORKERS=2 + export FLUENTD_POD_INVENTORY_WORKER_ID=1 + export FLUENTD_NODE_INVENTORY_WORKER_ID=1 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + + *) + export NUM_OF_FLUENTD_WORKERS=1 + export FLUENTD_POD_INVENTORY_WORKER_ID=0 + export FLUENTD_NODE_INVENTORY_WORKER_ID=0 + export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 + export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 + export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + ;; + esac + echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc + echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc + source ~/.bashrc + + echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" + echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}" + echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}" + echo "pod mdm inventory worker id: 
${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" + echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" +} + #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding mkdir -p /var/opt/microsoft/docker-cimprov/state @@ -202,7 +262,7 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then export MDSD_PROXY_USERNAME=$user echo "export MDSD_PROXY_USERNAME=$MDSD_PROXY_USERNAME" >>~/.bashrc export MDSD_PROXY_PASSWORD_FILE=/opt/microsoft/docker-cimprov/proxy_password - echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >> ~/.bashrc + echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >>~/.bashrc #TODO: Compression + proxy creates a deserialization error in ODS. This needs a fix in MDSD export MDSD_ODS_COMPRESSION_LEVEL=0 @@ -434,7 +494,6 @@ fi export CONTAINER_RUNTIME="containerd" export NODE_NAME="" - if [ "$cAdvisorIsSecure" = true ]; then echo "Using port 10250" export IS_SECURE_CADVISOR_PORT=true @@ -460,7 +519,7 @@ if [ ! 
-z "$podWithValidContainerId" ]; then containerRuntime=$(echo $containerRuntime | tr "[:upper:]" "[:lower:]") nodeName=$(echo $nodeName | tr "[:upper:]" "[:lower:]") # use default container runtime if obtained runtime value is either empty or null - if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then + if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then echo "using default container runtime as $CONTAINER_RUNTIME since got containeRuntime as empty or null" else export CONTAINER_RUNTIME=$containerRuntime @@ -592,65 +651,8 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "*** starting fluentd v1 in daemonset" fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else - case $NUM_OF_FLUENTD_WORKERS in - 5) - export NUM_OF_FLUENTD_WORKERS=5 - export FLUENTD_POD_INVENTORY_WORKER_ID=4 - export FLUENTD_NODE_INVENTORY_WORKER_ID=3 - export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 - export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 - export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - ;; - 4) - export NUM_OF_FLUENTD_WORKERS=4 - export FLUENTD_POD_INVENTORY_WORKER_ID=3 - export FLUENTD_NODE_INVENTORY_WORKER_ID=2 - export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 - export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 - export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - ;; - 3) - export NUM_OF_FLUENTD_WORKERS=3 - export FLUENTD_POD_INVENTORY_WORKER_ID=2 - export FLUENTD_NODE_INVENTORY_WORKER_ID=1 - export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 - export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 - export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - ;; - 2) - export NUM_OF_FLUENTD_WORKERS=2 - export FLUENTD_POD_INVENTORY_WORKER_ID=1 - export FLUENTD_NODE_INVENTORY_WORKER_ID=1 - export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 - export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 - export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - ;; - - *) - export NUM_OF_FLUENTD_WORKERS=1 - export FLUENTD_POD_INVENTORY_WORKER_ID=0 - 
export FLUENTD_NODE_INVENTORY_WORKER_ID=0 - export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 - export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 - export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - ;; - esac - echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc - echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc - source ~/.bashrc - - echo "*** fluentd worker configuration ***" - echo "num of workers:${NUM_OF_FLUENTD_WORKERS}" - echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" - echo "node inventory worker id: ${FLUENTD_NODE_INVENTORY_WORKER_ID}" - echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}" - echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" - echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" - + echo "*** configure fluentd worker ids" + configureFluentDWorkerIDsForRS echo "*** starting fluentd v1 in replicaset" fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & fi diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 905fd0e34..455444f85 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -53,7 +53,7 @@ def initialize @watchServicesThread = nil @serviceItemsCache = {} - @watchNodesThread = nil + @watchWinNodesThread = nil @windowsNodeNameListCache = [] @windowsContainerRecordsCacheSizeBytes = 0 @@ -105,7 +105,7 @@ def start @serviceCacheMutex = Mutex.new 
@windowsNodeNameCacheMutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) - @watchNodesThread = Thread.new(&method(:watch_windows_nodes)) + @watchWinNodesThread = Thread.new(&method(:watch_windows_nodes)) @watchPodsThread = Thread.new(&method(:watch_pods)) @watchServicesThread = Thread.new(&method(:watch_services)) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i @@ -121,7 +121,7 @@ def shutdown @thread.join @watchPodsThread.join @watchServicesThread.join - @watchNodesThread.join + @watchWinNodesThread.join super # This super must be at the end of shutdown method end end From 886557b87c834756951ac0497bb9355aa371abe5 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 10 Feb 2022 23:05:36 -0800 Subject: [PATCH 34/65] add requestId header for mdm metrics --- source/plugins/ruby/out_mdm.rb | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index 82d6e07db..a542c5eb0 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -1,7 +1,7 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/output' +require "fluent/plugin/output" module Fluent::Plugin class OutputMDM < Output @@ -12,6 +12,7 @@ def initialize super require "net/http" require "net/https" + require "securerandom" require "uri" require "yajl/json_gem" require_relative "KubernetesApiClient" @@ -326,47 +327,49 @@ def send_to_mdm(post_body) else access_token = get_access_token end + requestId = SecureRandom.uuid.to_s request = Net::HTTP::Post.new(@post_request_uri.request_uri) request["Content-Type"] = "application/x-ndjson" request["Authorization"] = "Bearer #{access_token}" + request["x-request-id"] = requestId request.body = post_body.join("\n") - @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024}" + @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024} for requestId: #{requestId}" response = 
@http_client.request(request) response.value # this throws for non 200 HTTP response code - @log.info "HTTP Post Response Code : #{response.code}" + @log.info "HTTP Post Response Code : #{response.code} for requestId: #{requestId}" if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) @last_telemetry_sent_time = Time.now end - rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException + rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException if !response.nil? && !response.body.nil? #body will have actual error - @log.info "Failed to Post Metrics to MDM : #{e} Response.body: #{response.body}" + @log.info "Failed to Post Metrics to MDM for requestId: #{requestId} exception: #{e} Response.body: #{response.body}" else - @log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}" + @log.info "Failed to Post Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" end @log.debug_backtrace(e.backtrace) if !response.code.empty? && response.code == 403.to_s - @log.info "Response Code #{response.code} Updating @last_post_attempt_time" + @log.info "Response Code #{response.code} for requestId: #{requestId} Updating @last_post_attempt_time" @last_post_attempt_time = Time.now @first_post_attempt_made = true # Not raising exception, as that will cause retries to happen elsif !response.code.empty? 
&& response.code.start_with?("4") # Log 400 errors and continue - @log.info "Non-retryable HTTPClientException when POSTing Metrics to MDM #{e} Response: #{response}" + @log.info "Non-retryable HTTPClientException when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" else # raise if the response code is non-400 - @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" + @log.info "HTTPServerException when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" raise e end # Adding exceptions to hash to aggregate and send telemetry for all 400 error codes exception_aggregator(e) rescue Errno::ETIMEDOUT => e - @log.info "Timed out when POSTing Metrics to MDM : #{e} Response: #{response}" + @log.info "Timed out when POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" @log.debug_backtrace(e.backtrace) raise e rescue Exception => e - @log.info "Exception POSTing Metrics to MDM : #{e} Response: #{response}" + @log.info "Exception POSTing Metrics to MDM for requestId: #{requestId} exception: #{e} Response: #{response}" @log.debug_backtrace(e.backtrace) raise e end From c594c5a3bac3d9202faf907a5f1dc9db7a8a30a3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 16 Feb 2022 22:56:18 -0800 Subject: [PATCH 35/65] latest mdsd and fix for threading issue in out mdm --- build/linux/installer/conf/kube.conf | 26 +++++++++++----------- charts/azuremonitor-containers/values.yaml | 2 +- kubernetes/linux/main.sh | 26 +++++++++++++++++----- kubernetes/linux/setup.sh | 5 +++-- kubernetes/omsagent.yaml | 6 +++-- source/plugins/ruby/out_mdm.rb | 24 +++++++++----------- 6 files changed, 51 insertions(+), 38 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index dcdf1cdf8..016a7942a 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -21,7 
+21,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -54,7 +54,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -90,7 +90,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -116,7 +116,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -151,7 +151,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -182,7 +182,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -201,7 +201,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -236,7 +236,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -263,7 +263,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 50 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 
5m @@ -350,7 +350,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -377,7 +377,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m @@ -403,7 +403,7 @@ # overflow_action drop_oldest_chunk # chunk_limit_size 4m # queue_limit_length 20 - # flush_interval 20s + # flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" # retry_max_times 10 # retry_wait 5s # retry_max_interval 5m @@ -422,7 +422,7 @@ overflow_action drop_oldest_chunk chunk_limit_size 4m queue_limit_length 20 - flush_interval 20s + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s retry_max_interval 5m diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index aa4c6bcf2..e15791d21 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -25,7 +25,7 @@ omsagent: tagWindows: "win-ciprod01312022" pullPolicy: IfNotPresent dockerProviderVersion: "16.0.0-0" - agentVersion: "azure-mdsd-1.14.2" + agentVersion: "azure-mdsd-1..17.0" winAgentVersion: "0.0.0-0" # there is no base agent version for windows agent # The priority used by the omsagent priority class for the daemonset pods diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 5f3c4c902..80d13805c 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -80,8 +80,9 @@ checkAgentOnboardingStatus() { fi } -configureFluentDWorkerIDsForRS() { +configureFluentDConfigForRS() { echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}" + export FLUENTD_FLUSH_INTERVAL="20s" # default 20s, evaluate if required lower flush interval at high scale case $NUM_OF_FLUENTD_WORKERS in 5) export 
NUM_OF_FLUENTD_WORKERS=5 @@ -90,6 +91,7 @@ configureFluentDWorkerIDsForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + #export FLUENTD_FLUSH_INTERVAL="5s" ;; 4) export NUM_OF_FLUENTD_WORKERS=4 @@ -98,6 +100,7 @@ configureFluentDWorkerIDsForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + #export FLUENTD_FLUSH_INTERVAL="10s" ;; 3) export NUM_OF_FLUENTD_WORKERS=3 @@ -106,6 +109,7 @@ configureFluentDWorkerIDsForRS() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + #export FLUENTD_FLUSH_INTERVAL="15s" ;; 2) export NUM_OF_FLUENTD_WORKERS=2 @@ -114,6 +118,7 @@ configureFluentDWorkerIDsForRS() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + #export FLUENTD_FLUSH_INTERVAL="20s" ;; *) @@ -123,6 +128,7 @@ configureFluentDWorkerIDsForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 + #export FLUENTD_FLUSH_INTERVAL="20s" ;; esac echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc @@ -131,6 +137,9 @@ configureFluentDWorkerIDsForRS() { echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_FLUSH_INTERVAL=$FLUENTD_FLUSH_INTERVAL" >>~/.bashrc + source ~/.bashrc echo "pod inventory worker id: ${FLUENTD_POD_INVENTORY_WORKER_ID}" @@ -138,6 +147,7 @@ 
configureFluentDWorkerIDsForRS() { echo "event inventory worker id: ${FLUENTD_EVENT_INVENTORY_WORKER_ID}" echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" + echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}" } #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding @@ -264,9 +274,6 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then export MDSD_PROXY_PASSWORD_FILE=/opt/microsoft/docker-cimprov/proxy_password echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >>~/.bashrc - #TODO: Compression + proxy creates a deserialization error in ODS. This needs a fix in MDSD - export MDSD_ODS_COMPRESSION_LEVEL=0 - echo "export MDSD_ODS_COMPRESSION_LEVEL=$MDSD_ODS_COMPRESSION_LEVEL" >>~/.bashrc fi if [ ! -z "$PROXY_ENDPOINT" ]; then @@ -616,6 +623,13 @@ else echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >>~/.bashrc export MDSD_FLUENT_SOCKET_PORT="29230" echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >>~/.bashrc + # set the libcurl specific env and configuration + export ENABLE_CURL_UPLOAD=true + echo "export ENABLE_CURL_UPLOAD=$ENABLE_CURL_UPLOAD" >> ~/.bashrc + export CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt + echo "export CURL_CA_BUNDLE=$CURL_CA_BUNDLE" >> ~/.bashrc + mkdir -p /etc/pki/tls/certs + cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt fi source ~/.bashrc @@ -651,8 +665,8 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "*** starting fluentd v1 in daemonset" fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else - echo "*** configure fluentd worker ids" - configureFluentDWorkerIDsForRS + echo "*** configure fluentd config" + configureFluentDConfigForRS echo "*** starting fluentd v1 in replicaset" fluentd -c 
/etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & fi diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 872ac99cf..aca05fc08 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -9,8 +9,9 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ update-locale LANG=en_US.UTF-8 -#install oneagent - Official bits (10/7/2021) -wget https://github.com/microsoft/Docker-Provider/releases/download/1.14/azure-mdsd_1.14.2-build.master.284_x86_64.deb +#install oneagent - Official bits (02/15/2022) +wget https://github.com/microsoft/Docker-Provider/releases/download/1.17.0/azure-mdsd_1.17.0-build.master.352_x86_64.deb + /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index b0ccb6712..19f5afe92 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -357,7 +357,7 @@ spec: component: oms-agent tier: node annotations: - agentVersion: "azure-mdsd-1.14.2" + agentVersion: "azure-mdsd-1..17.0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: @@ -598,7 +598,7 @@ spec: labels: rsName: "omsagent-rs" annotations: - agentVersion: "azure-mdsd-1.14.2" + agentVersion: "azure-mdsd-1..17.0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: @@ -620,6 +620,8 @@ spec: resourceFieldRef: containerName: omsagent resource: limits.cpu + # - name: MONITORING_MAX_EVENT_RATE + # value: "50000" # default 20KPS for MDSD, for large cluster validate 50KPS - name: EMIT_CACHE_TELEMETRY value: "true" # enable only debug or test purpose and disable for prod - name: AKS_RESOURCE_ID diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index a542c5eb0..fb66ec158 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -43,7 +43,6 @@ def 
initialize @data_hash = {} @parsed_token_uri = nil - @http_client = nil @token_expiry_time = Time.now @cached_access_token = String.new @last_post_attempt_time = Time.now @@ -62,6 +61,7 @@ def initialize @mdm_exceptions_hash = {} @mdm_exceptions_count = 0 @mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i + @proxy = nil end def configure(conf) @@ -97,18 +97,7 @@ def start end @@post_request_url = @@post_request_url_template % { aks_region: aks_region, aks_resource_id: aks_resource_id } @post_request_uri = URI.parse(@@post_request_url) - if (!!@isArcK8sCluster) - proxy = (ProxyUtils.getProxyConfiguration) - if proxy.nil? || proxy.empty? - @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) - else - @log.info "Proxy configured on this cluster: #{aks_resource_id}" - @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port, proxy[:addr], proxy[:port], proxy[:user], proxy[:pass]) - end - else - @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) - end - @http_client.use_ssl = true + @proxy = (ProxyUtils.getProxyConfiguration) @log.info "POST Request url: #{@@post_request_url}" ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPluginStart", {}) @@ -327,6 +316,13 @@ def send_to_mdm(post_body) else access_token = get_access_token end + if @proxy.nil? || @proxy.empty? 
+ http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) + else + @log.info "Proxy configured on this cluster: #{aks_resource_id}" + http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port, @proxy[:addr], @proxy[:port], @proxy[:user], @proxy[:pass]) + end + http_client.use_ssl = true requestId = SecureRandom.uuid.to_s request = Net::HTTP::Post.new(@post_request_uri.request_uri) request["Content-Type"] = "application/x-ndjson" @@ -335,7 +331,7 @@ def send_to_mdm(post_body) request.body = post_body.join("\n") @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024} for requestId: #{requestId}" - response = @http_client.request(request) + response = http_client.request(request) response.value # this throws for non 200 HTTP response code @log.info "HTTP Post Response Code : #{response.code} for requestId: #{requestId}" if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now From 0297f7bf9e6b12077690d25946d8cef075f79e18 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 17 Feb 2022 23:42:23 -0800 Subject: [PATCH 36/65] rs specific config for large cluster --- build/linux/installer/conf/kube.conf | 24 +++++------ kubernetes/linux/main.sh | 41 ++++++++++++++----- .../plugins/ruby/in_kube_podmdminventory.rb | 4 +- 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 016a7942a..50a917631 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -20,7 +20,7 @@ @type file overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -53,7 +53,7 @@ @type file overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" 
flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -89,7 +89,7 @@ path /var/opt/microsoft/docker-cimprov/state/kubepod*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -115,7 +115,7 @@ path /var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -150,7 +150,7 @@ path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -181,7 +181,7 @@ path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -200,7 +200,7 @@ path /var/opt/microsoft/docker-cimprov/state/out_mdm_nodeinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -235,7 +235,7 @@ path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -262,7 +262,7 @@ path 
/var/opt/microsoft/docker-cimprov/state/out_mdm_podinventory*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 50 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -349,7 +349,7 @@ path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -376,7 +376,7 @@ path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s @@ -421,7 +421,7 @@ path /var/opt/microsoft/docker-cimprov/state/out_mdm_cdvisorperf*.buffer overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length 20 + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 80d13805c..752b85440 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -80,9 +80,10 @@ checkAgentOnboardingStatus() { fi } -configureFluentDConfigForRS() { +setReplicaSetSpecificConfig() { echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}" - export FLUENTD_FLUSH_INTERVAL="20s" # default 20s, evaluate if required lower flush interval at high scale + export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_QUEUE_LIMIT_LENGTH="20" # default case $NUM_OF_FLUENTD_WORKERS in 5) export NUM_OF_FLUENTD_WORKERS=5 @@ -91,7 +92,9 @@ configureFluentDConfigForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 
- #export FLUENTD_FLUSH_INTERVAL="5s" + export FLUENTD_FLUSH_INTERVAL="5s" + export FLUENTD_QUEUE_LIMIT_LENGTH="50" + export MONITORING_MAX_EVENT_RATE="50000" # default MDSD EPS is 20K which is not enough for large scale ;; 4) export NUM_OF_FLUENTD_WORKERS=4 @@ -100,7 +103,9 @@ configureFluentDConfigForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=1 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - #export FLUENTD_FLUSH_INTERVAL="10s" + export FLUENTD_FLUSH_INTERVAL="10s" + export FLUENTD_QUEUE_LIMIT_LENGTH="40" + export MONITORING_MAX_EVENT_RATE="40000" # default MDSD EPS is 20K which is not enough for large scale ;; 3) export NUM_OF_FLUENTD_WORKERS=3 @@ -109,7 +114,9 @@ configureFluentDConfigForRS() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - #export FLUENTD_FLUSH_INTERVAL="15s" + export FLUENTD_FLUSH_INTERVAL="15s" + export FLUENTD_QUEUE_LIMIT_LENGTH="30" + export MONITORING_MAX_EVENT_RATE="30000" # default MDSD EPS is 20K which is not enough for large scale ;; 2) export NUM_OF_FLUENTD_WORKERS=2 @@ -118,7 +125,9 @@ configureFluentDConfigForRS() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - #export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_QUEUE_LIMIT_LENGTH="20" + export MONITORING_MAX_EVENT_RATE="25000" # default MDSD EPS is 20K which is not enough for large scale ;; *) @@ -128,7 +137,8 @@ configureFluentDConfigForRS() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 - #export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_FLUSH_INTERVAL="20s" + export FLUENTD_QUEUE_LIMIT_LENGTH="20" ;; esac echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc @@ -139,6 +149,12 @@ configureFluentDConfigForRS() { echo 
"export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_FLUSH_INTERVAL=$FLUENTD_FLUSH_INTERVAL" >>~/.bashrc + echo "export FLUENTD_QUEUE_LIMIT_LENGTH=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc + + if [ ! -z $MONITORING_MAX_EVENT_RATE ]; then + echo "export MONITORING_MAX_EVENT_RATE=$MONITORING_MAX_EVENT_RATE" >>~/.bashrc + echo "Configured MDSD Max EPS is: ${MONITORING_MAX_EVENT_RATE}" + fi source ~/.bashrc @@ -148,6 +164,7 @@ configureFluentDConfigForRS() { echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}" + echo "fluentd buffer plugin queue length=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc } #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding @@ -581,6 +598,10 @@ echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >>~/.bashrc +if [ "${CONTROLLER_TYPE}" == "ReplicaSet" ]; then + echo "*** set applicable replicaset config ***" + setReplicaSetSpecificConfig +fi #skip imds lookup since not used either legacy or aad msi auth path export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >>~/.bashrc @@ -665,10 +686,8 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then echo "*** starting fluentd v1 in daemonset" fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else - echo "*** configure fluentd config" - configureFluentDConfigForRS - echo "*** starting fluentd v1 in replicaset" - fluentd -c 
/etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & + echo "*** starting fluentd v1 in replicaset" + fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & fi fi diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 98f06dc0c..714d78e07 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -112,9 +112,9 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) end end end - @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm" + @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm @ #{Time.now.utc.iso8601}" pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) - @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" + @log.info "in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size} @ #{Time.now.utc.iso8601}" mdm_pod_inventory_es = Fluent::MultiEventStream.new pod_inventory_mdm_records.each { |pod_inventory_mdm_record| mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record From 89a96da17529cb20bd1c368a20165cbfd113c61a Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 21 Feb 2022 18:33:28 -0800 Subject: [PATCH 37/65] optimize out mdm --- build/linux/installer/conf/kube.conf | 2 +- kubernetes/linux/main.sh | 25 +++++++++++-------- source/plugins/ruby/constants.rb | 2 ++ .../plugins/ruby/in_kube_podmdminventory.rb | 21 +++++++++------- source/plugins/ruby/out_mdm.rb | 4 +++ source/plugins/ruby/podinventory_to_mdm.rb | 2 +- 6 files changed, 34 insertions(+), 22 deletions(-) diff --git 
a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 50a917631..1a566ec28 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -267,7 +267,7 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count "#{ENV['FLUENTD_MDM_FLUSH_THREAD_COUNT']}" retry_mdm_post_wait_minutes 30 diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 752b85440..0f113905c 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -84,6 +84,7 @@ setReplicaSetSpecificConfig() { echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}" export FLUENTD_FLUSH_INTERVAL="20s" export FLUENTD_QUEUE_LIMIT_LENGTH="20" # default + export FLUENTD_MDM_FLUSH_THREAD_COUNT="5" # default case $NUM_OF_FLUENTD_WORKERS in 5) export NUM_OF_FLUENTD_WORKERS=5 @@ -93,8 +94,9 @@ setReplicaSetSpecificConfig() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="5s" - export FLUENTD_QUEUE_LIMIT_LENGTH="50" - export MONITORING_MAX_EVENT_RATE="50000" # default MDSD EPS is 20K which is not enough for large scale + export FLUENTD_QUEUE_LIMIT_LENGTH="60" + export MONITORING_MAX_EVENT_RATE="100000" # default MDSD EPS is 20K which is not enough for large scale + export FLUENTD_MDM_FLUSH_THREAD_COUNT="20" # if the pod mdm inventory running on separate worker ;; 4) export NUM_OF_FLUENTD_WORKERS=4 @@ -104,8 +106,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="10s" - export FLUENTD_QUEUE_LIMIT_LENGTH="40" - export MONITORING_MAX_EVENT_RATE="40000" # default MDSD EPS is 20K which is not enough for large scale + export FLUENTD_QUEUE_LIMIT_LENGTH="50" + export MONITORING_MAX_EVENT_RATE="80000" # default MDSD EPS is 20K which is not enough for large scale ;; 3) export NUM_OF_FLUENTD_WORKERS=3 @@ -115,8 +117,8 @@ 
setReplicaSetSpecificConfig() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="15s" - export FLUENTD_QUEUE_LIMIT_LENGTH="30" - export MONITORING_MAX_EVENT_RATE="30000" # default MDSD EPS is 20K which is not enough for large scale + export FLUENTD_QUEUE_LIMIT_LENGTH="40" + export MONITORING_MAX_EVENT_RATE="60000" # default MDSD EPS is 20K which is not enough for large scale ;; 2) export NUM_OF_FLUENTD_WORKERS=2 @@ -126,8 +128,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="20s" - export FLUENTD_QUEUE_LIMIT_LENGTH="20" - export MONITORING_MAX_EVENT_RATE="25000" # default MDSD EPS is 20K which is not enough for large scale + export FLUENTD_QUEUE_LIMIT_LENGTH="30" + export MONITORING_MAX_EVENT_RATE="40000" # default MDSD EPS is 20K which is not enough for large scale ;; *) @@ -145,11 +147,11 @@ setReplicaSetSpecificConfig() { echo "export FLUENTD_POD_INVENTORY_WORKER_ID=$FLUENTD_POD_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_NODE_INVENTORY_WORKER_ID=$FLUENTD_NODE_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_EVENT_INVENTORY_WORKER_ID=$FLUENTD_EVENT_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc - echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc + echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_FLUSH_INTERVAL=$FLUENTD_FLUSH_INTERVAL" >>~/.bashrc echo "export FLUENTD_QUEUE_LIMIT_LENGTH=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc + echo "export FLUENTD_MDM_FLUSH_THREAD_COUNT=$FLUENTD_MDM_FLUSH_THREAD_COUNT" >>~/.bashrc if [ ! 
-z $MONITORING_MAX_EVENT_RATE ]; then echo "export MONITORING_MAX_EVENT_RATE=$MONITORING_MAX_EVENT_RATE" >>~/.bashrc @@ -164,7 +166,8 @@ setReplicaSetSpecificConfig() { echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}" - echo "fluentd buffer plugin queue length=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc + echo "fluentd out mdm flush thread count: $FLUENTD_MDM_FLUSH_THREAD_COUNT" >>~/.bashrc + echo "fluentd buffer plugin queue length: $FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc } #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index ca966fb12..883c6d15f 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -141,4 +141,6 @@ class Constants MDM_POD_INVENTORY_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/MDMPodInventoryState.json" # FileName for NodeAllocatable Records state NODE_ALLOCATABLE_RECORDS_STATE_FILE = "/var/opt/microsoft/docker-cimprov/state/NodeAllocatableRecords.json" + # Emit Stream size for Pod MDM metric + POD_MDM_EMIT_STREAM_BATCH_SIZE = 5000 # each record is 200 bytes, 5k records ~2MB end diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 714d78e07..76b0eed0f 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -115,18 +115,21 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) @log.info "in_kube_podmdminventory:parse_and_emit_records:Sending pod inventory mdm records to out_mdm @ #{Time.now.utc.iso8601}" pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) @log.info 
"in_kube_podmdminventory:parse_and_emit_records:pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size} @ #{Time.now.utc.iso8601}" - mdm_pod_inventory_es = Fluent::MultiEventStream.new - pod_inventory_mdm_records.each { |pod_inventory_mdm_record| - mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record - if mdm_pod_inventory_es.count >= 5000 # 5k records of MDM is ~2MB and each record is ~400 bytes + if !pod_inventory_mdm_records.nil? && pod_inventory_mdm_records.length > 0 + startTime = (Time.now.to_f * 1000).to_i + recordCount = pod_inventory_mdm_records.length + while recordCount > 0 + record_array = pod_inventory_mdm_records.take(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE) + time_array = Array.new(records.length) { batchTime } + mdm_pod_inventory_es = Fluent::MultiEventStream.new(time_array, record_array) router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) - mdm_pod_inventory_es = Fluent::MultiEventStream.new + pod_inventory_mdm_records = pod_inventory_mdm_records.drop(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE) + recordCount = pod_inventory_mdm_records.length + time_array = nil end - } if pod_inventory_mdm_records - if mdm_pod_inventory_es.count > 0 - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) + flushTimeMs = (Time.now.to_f * 1000).to_i - startTime + @log.info "in_kube_podmdminventory:parse_and_emit_records:timetaken to flush all Pod MDM records: #{flushTimeMs} @ #{Time.now.utc.iso8601}" end - mdm_pod_inventory_es = nil end rescue => errorStr $log.warn "in_kube_podmdminventory:parse_and_emit_records: failed with an error #{errorStr}" diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index fb66ec158..dd60a250b 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -141,6 +141,10 @@ def start end end + def multi_workers_ready? 
+ return true + end + # get the access token only if the time to expiry is less than 5 minutes and get_access_token_backoff has expired def get_access_token if (Time.now > @get_access_token_backoff_expiry) diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index 278632cb0..a7f9c5435 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -129,7 +129,7 @@ def get_pod_inventory_mdm_records(batch_time) controllerNameDimValue: podControllerNameDimValue, podCountMetricValue: value, } - records.push(JSON.parse(record)) + records.push(Yajl::Parser.parse(record)) } #Add pod metric records From a6d04c5815057ae6ca1ce66b475b2d6efd8cb3aa Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 21 Feb 2022 21:18:07 -0800 Subject: [PATCH 38/65] bug fix --- kubernetes/linux/main.sh | 4 ++-- source/plugins/ruby/in_kube_podinventory.rb | 21 +++++++++++-------- .../plugins/ruby/in_kube_podmdminventory.rb | 14 +++++++++---- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 0f113905c..701b63b17 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -166,8 +166,8 @@ setReplicaSetSpecificConfig() { echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}" - echo "fluentd out mdm flush thread count: $FLUENTD_MDM_FLUSH_THREAD_COUNT" >>~/.bashrc - echo "fluentd buffer plugin queue length: $FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc + echo "fluentd buffer plugin queue length: ${FLUENTD_QUEUE_LIMIT_LENGTH}" + echo "fluentd out mdm flush thread count: ${FLUENTD_MDM_FLUSH_THREAD_COUNT}" } #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding diff --git 
a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 455444f85..28d11e29a 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -26,6 +26,7 @@ def initialize require_relative "omslog" require_relative "constants" require_relative "extension_utils" + require_relative "CustomMetricsUtils" # refer tomlparser-agent-config for updating defaults # this configurable via configmap @@ -348,16 +349,18 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if continuationToken.nil? #no more chunks in this batch to be sent, write all mdm pod inventory records to send - begin - if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 - mdmPodRecordsJson = @mdmPodRecords.to_json - @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" - @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" - writeMDMRecords(mdmPodRecordsJson) - @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" + if CustomMetricsUtils.check_custom_metrics_availability + begin + if !@mdmPodRecords.nil? 
&& @mdmPodRecords.length > 0 + mdmPodRecordsJson = @mdmPodRecords.to_json + @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" + @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" + writeMDMRecords(mdmPodRecordsJson) + @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" + end + rescue => err + @log.warn "in_kube_podinventory::parse_and_emit_records: failed to write MDMRecords with an error: #{err} @ #{Time.now.utc.iso8601}" end - rescue => err - @log.warn "in_kube_podinventory::parse_and_emit_records: failed to write MDMRecords with an error: #{err} @ #{Time.now.utc.iso8601}" end end diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 76b0eed0f..8272420c3 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -24,6 +24,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" + require_relative "CustomMetricsUtils" end config_param :run_interval, :time, :default => 60 @@ -37,6 +38,7 @@ def start if @run_interval super $log.info("in_kube_podmdminventory::start @ #{Time.now.utc.iso8601}") + @isCustomMetricsAvailability = CustomMetricsUtils.check_custom_metrics_availability @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -57,9 +59,13 @@ def shutdown def enumerate begin - currentTime = Time.now - batchTime = currentTime.utc.iso8601 - parse_and_emit_records(batchTime) + if !@isCustomMetricsAvailability + $log.warn "in_kube_podmdminventory::enumerate:skipping since custom metrics not available either for this cluster type or the region" + else + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + parse_and_emit_records(batchTime) + end rescue => errorStr $log.warn 
"in_kube_podmdminventory::enumerate:Failed in enumerate: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -120,7 +126,7 @@ def parse_and_emit_records(batchTime = Time.utc.iso8601) recordCount = pod_inventory_mdm_records.length while recordCount > 0 record_array = pod_inventory_mdm_records.take(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE) - time_array = Array.new(records.length) { batchTime } + time_array = Array.new(record_array.length) { batchTime } mdm_pod_inventory_es = Fluent::MultiEventStream.new(time_array, record_array) router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) pod_inventory_mdm_records = pod_inventory_mdm_records.drop(Constants::POD_MDM_EMIT_STREAM_BATCH_SIZE) From 4c4d2e6b3f00c15f8e7a8e56a89cffec3e71e995 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 22 Feb 2022 17:55:51 -0800 Subject: [PATCH 39/65] use large queue limit for kube perf --- build/linux/installer/conf/kube.conf | 2 +- kubernetes/linux/main.sh | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 1a566ec28..1b68f990e 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -20,7 +20,7 @@ @type file overflow_action drop_oldest_chunk chunk_limit_size 4m - queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + queue_limit_length "#{ENV['FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH']}" flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" retry_max_times 10 retry_wait 5s diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 701b63b17..fb824c6bb 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -84,6 +84,7 @@ setReplicaSetSpecificConfig() { echo "num of fluentd workers:${NUM_OF_FLUENTD_WORKERS}" export FLUENTD_FLUSH_INTERVAL="20s" export FLUENTD_QUEUE_LIMIT_LENGTH="20" # default + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="20" export FLUENTD_MDM_FLUSH_THREAD_COUNT="5" 
# default case $NUM_OF_FLUENTD_WORKERS in 5) @@ -94,7 +95,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=1 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="5s" - export FLUENTD_QUEUE_LIMIT_LENGTH="60" + export FLUENTD_QUEUE_LIMIT_LENGTH="50" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="100" # kube perf is high volume so would need large queue limit to avoid data loss export MONITORING_MAX_EVENT_RATE="100000" # default MDSD EPS is 20K which is not enough for large scale export FLUENTD_MDM_FLUSH_THREAD_COUNT="20" # if the pod mdm inventory running on separate worker ;; @@ -106,7 +108,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="10s" - export FLUENTD_QUEUE_LIMIT_LENGTH="50" + export FLUENTD_QUEUE_LIMIT_LENGTH="40" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="80" # kube perf is high volume so would need large queue limit export MONITORING_MAX_EVENT_RATE="80000" # default MDSD EPS is 20K which is not enough for large scale ;; 3) @@ -117,7 +120,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="15s" - export FLUENTD_QUEUE_LIMIT_LENGTH="40" + export FLUENTD_QUEUE_LIMIT_LENGTH="30" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="60" # kube perf is high volume so would need large queue limit export MONITORING_MAX_EVENT_RATE="60000" # default MDSD EPS is 20K which is not enough for large scale ;; 2) @@ -128,7 +132,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_EVENT_INVENTORY_WORKER_ID=0 export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="20s" - export FLUENTD_QUEUE_LIMIT_LENGTH="30" + export FLUENTD_QUEUE_LIMIT_LENGTH="20" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="40" # kube perf is high volume so would need large queue limit export MONITORING_MAX_EVENT_RATE="40000" # 
default MDSD EPS is 20K which is not enough for large scale ;; @@ -141,6 +146,7 @@ setReplicaSetSpecificConfig() { export FLUENTD_OTHER_INVENTORY_WORKER_ID=0 export FLUENTD_FLUSH_INTERVAL="20s" export FLUENTD_QUEUE_LIMIT_LENGTH="20" + export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="20" ;; esac echo "export NUM_OF_FLUENTD_WORKERS=$NUM_OF_FLUENTD_WORKERS" >>~/.bashrc @@ -150,6 +156,7 @@ setReplicaSetSpecificConfig() { echo "export FLUENTD_POD_MDM_INVENTORY_WORKER_ID=$FLUENTD_POD_MDM_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_OTHER_INVENTORY_WORKER_ID=$FLUENTD_OTHER_INVENTORY_WORKER_ID" >>~/.bashrc echo "export FLUENTD_FLUSH_INTERVAL=$FLUENTD_FLUSH_INTERVAL" >>~/.bashrc + echo "export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH=$FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH" >>~/.bashrc echo "export FLUENTD_QUEUE_LIMIT_LENGTH=$FLUENTD_QUEUE_LIMIT_LENGTH" >>~/.bashrc echo "export FLUENTD_MDM_FLUSH_THREAD_COUNT=$FLUENTD_MDM_FLUSH_THREAD_COUNT" >>~/.bashrc @@ -166,7 +173,8 @@ setReplicaSetSpecificConfig() { echo "pod mdm inventory worker id: ${FLUENTD_POD_MDM_INVENTORY_WORKER_ID}" echo "other inventory worker id: ${FLUENTD_OTHER_INVENTORY_WORKER_ID}" echo "fluentd flush interval: ${FLUENTD_FLUSH_INTERVAL}" - echo "fluentd buffer plugin queue length: ${FLUENTD_QUEUE_LIMIT_LENGTH}" + echo "fluentd kube perf buffer plugin queue length: ${FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH}" + echo "fluentd buffer plugin queue length for all other non kube perf plugin: ${FLUENTD_QUEUE_LIMIT_LENGTH}" echo "fluentd out mdm flush thread count: ${FLUENTD_MDM_FLUSH_THREAD_COUNT}" } From 333cd8081d27495451c5518bd7f6a76a01ce52d0 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 22 Feb 2022 19:00:32 -0800 Subject: [PATCH 40/65] 5k preview rs limits --- kubernetes/omsagent.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 19f5afe92..88ad931b1 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ 
-610,7 +610,7 @@ spec: resources: limits: cpu: 5 - memory: 2Gi + memory: 5Gi requests: cpu: 150m memory: 250Mi From fb57c3c406ebffaa2fe47ce784f4f8b5c612f8fc Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 23 Feb 2022 16:47:53 -0800 Subject: [PATCH 41/65] handle resourceversion empty or 0 scenrio --- source/plugins/ruby/in_kube_nodes.rb | 117 +++++---- source/plugins/ruby/in_kube_perfinventory.rb | 118 +++++---- source/plugins/ruby/in_kube_podinventory.rb | 257 ++++++++++--------- 3 files changed, 264 insertions(+), 228 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index d3077e713..8ee2e5fc2 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -657,68 +657,77 @@ def watch_nodes end end end - begin - $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
- nodesResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - if !nodeItem.nil? && !nodeItem.empty? + if nodesResourceVersion.nil? || nodesResourceVersion.empty? || nodesResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion: #{nodesResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? 
&& + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? + @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem + @nodeItemsCache.delete(key) } - else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? 
- @nodeCacheMutex.synchronize { - @nodeItemsCache.delete(key) - } end + when "ERROR" + nodesResourceVersion = nil + $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + nodesResourceVersion = nil + $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + break end - when "ERROR" - nodesResourceVersion = nil - $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection - # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end rescue => errorStr $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 00f7b02db..b6abbc263 100644 --- 
a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -305,69 +305,77 @@ def watch_pods end end end - begin - $log.info("in_kube_perfinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_perfinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - podsResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.warn("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - podsResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) - if !podItem.nil? && !podItem.empty? + if podsResourceVersion.nil? || podsResourceVersion.empty? 
|| podsResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_perfinventory::watch_pods:received podsResourceVersion: #{podsResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_perfinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_perfinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + podsResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_perfinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_perfinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) + if !podItem.nil? 
&& !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? @podCacheMutex.synchronize { - @podItemsCache[key] = podItem + @podItemsCache.delete(key) } - else - $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - @podCacheMutex.synchronize { - @podItemsCache.delete(key) - } end + when "ERROR" + podsResourceVersion = nil + $log.warn("in_kube_perfinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + podsResourceVersion = nil + $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end - when "ERROR" - podsResourceVersion = nil - $log.warn("in_kube_perfinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end + $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end - $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity more than readtimeout value used in the connection + 
# $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity more than readtimeout value used in the connection - # $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_perfinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - podsResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end rescue => errorStr $log.warn("in_kube_perfinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 28d11e29a..c44943b0e 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -787,81 +787,91 @@ def watch_pods end end end - begin - $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? 
&& !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - podsResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.warn("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - podsResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - currentWindowsNodeNameList = [] - @windowsNodeNameCacheMutex.synchronize { - currentWindowsNodeNameList = @windowsNodeNameListCache.dup - } - isWindowsPodItem = false - nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" - if !nodeName.empty? && - !currentWindowsNodeNameList.nil? && - !currentWindowsNodeNameList.empty? && - currentWindowsNodeNameList.include?(nodeName) - isWindowsPodItem = true + if podsResourceVersion.nil? || podsResourceVersion.empty? 
|| podsResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_podinventory::watch_pods:received podsResourceVersion: #{podsResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("pods", resource_version: podsResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_pods:watch API returned nil watcher for watch connection with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + podsResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_podinventory::watch_pods:received event type: #{notice["type"]} with resource version: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_podinventory::watch_pods:received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? 
+ currentWindowsNodeNameList = [] + @windowsNodeNameCacheMutex.synchronize { + currentWindowsNodeNameList = @windowsNodeNameListCache.dup + } + isWindowsPodItem = false + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" end - podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) - if !podItem.nil? && !podItem.empty? + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? @podCacheMutex.synchronize { - @podItemsCache[key] = podItem + @podItemsCache.delete(key) } - else - $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - @podCacheMutex.synchronize { - @podItemsCache.delete(key) - } end + when "ERROR" + podsResourceVersion = nil + $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + # enforce LIST again otherwise cause inconsistency by skipping a potential RV with valid data! 
+ podsResourceVersion = nil + break end - when "ERROR" - podsResourceVersion = nil - $log.warn("in_kube_podinventory::watch_pods:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_podinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end + $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") end - $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection - # $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - podsResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end rescue => errorStr $log.warn("in_kube_podinventory::watch_pods:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") @@ -915,67 +925,76 @@ def watch_services serviceInventory = nil end end - begin - $log.info("in_kube_podinventory::watch_services:Establishing 
Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - servicesResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.warn("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - servicesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - serviceItem = KubernetesApiClient.getOptimizedItem("services", item) - if !serviceItem.nil? && !serviceItem.empty? + if servicesResourceVersion.nil? 
|| servicesResourceVersion == "" || servicesResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_podinventory::watch_services:received servicesResourceVersion: #{servicesResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("services", resource_version: servicesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_services:watch API returned nil watcher for watch connection with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + servicesResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_podinventory::watch_services: received event type: #{notice["type"]} with resource version: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_podinventory::watch_services: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) + key = item["metadata"]["uid"] + if !key.nil? 
&& !key.empty? + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + if !serviceItem.nil? && !serviceItem.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + else + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" + end + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? @serviceCacheMutex.synchronize { - @serviceItemsCache[key] = serviceItem + @serviceItemsCache.delete(key) } - else - $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? 
- @serviceCacheMutex.synchronize { - @serviceItemsCache.delete(key) - } end + when "ERROR" + servicesResourceVersion = nil + $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + servicesResourceVersion = nil + $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + break end - when "ERROR" - servicesResourceVersion = nil - $log.warn("in_kube_podinventory::watch_services:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_podinventory::watch_services:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + rescue Net::ReadTimeout => errorStr + # $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + servicesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - # $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - servicesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end rescue => errorStr $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") From 86f088e99a69af374ebaeab7733458aa6e9a0bbb Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 25 Feb 2022 00:18:27 -0800 Subject: [PATCH 42/65] handle pagination api call failures --- source/plugins/ruby/KubernetesApiClient.rb | 62 +++- source/plugins/ruby/in_kube_nodes.rb | 70 ++-- 
source/plugins/ruby/in_kube_perfinventory.rb | 74 +++-- source/plugins/ruby/in_kube_podinventory.rb | 317 ++++++++++--------- 4 files changed, 318 insertions(+), 205 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 0d4267685..7f8cd0498 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -75,6 +75,39 @@ def getKubeResourceInfo(resource, api_group: nil) return response end + def getKubeResourceInfoV2(resource, api_group: nil) + headers = {} + response = nil + responseCode = nil + @Log.info "Getting Kube resource: #{resource}" + begin + resourceUri = getResourceUri(resource, api_group) + if !resourceUri.nil? + uri = URI.parse(resourceUri) + if !File.exist?(@@CaFile) + raise "#{@@CaFile} doesnt exist" + else + Net::HTTP.start(uri.host, uri.port, :use_ssl => true, :ca_file => @@CaFile, :verify_mode => OpenSSL::SSL::VERIFY_PEER, :open_timeout => 20, :read_timeout => 40) do |http| + kubeApiRequest = Net::HTTP::Get.new(uri.request_uri) + kubeApiRequest["Authorization"] = "Bearer " + getTokenStr + @Log.info "KubernetesAPIClient::getKubeResourceInfoV2 : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" + response = http.request(kubeApiRequest) + responseCode = response.code + @Log.info "KubernetesAPIClient::getKubeResourceInfoV2 : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" + end + end + end + rescue => error + @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") + end + if (!response.nil?) + if (!response.body.nil? && response.body.empty?) + @Log.warn("KubernetesAPIClient::getKubeResourceInfoV2 : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") + end + end + return responseCode, response + end + def getTokenStr return @@TokenStr if !@@TokenStr.nil? 
begin @@ -759,12 +792,37 @@ def getMetricNumericValue(metricName, metricVal) return metricValue end # getMetricNumericValue + def getResourcesAndContinuationTokenV2(uri, api_group: nil) + continuationToken = nil + resourceInventory = nil + responseCode = nil + begin + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2 : Getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" + responseCode, resourceInfo = getKubeResourceInfoV2(uri, api_group: api_group) + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2 : Done getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" + if !responseCode.nil? && responseCode == "200" && !resourceInfo.nil? + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2:Start:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" + resourceInventory = Yajl::Parser.parse(StringIO.new(resourceInfo.body)) + @Log.info "KubernetesApiClient::getResourcesAndContinuationTokenV2:End:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" + resourceInfo = nil + end + if (!resourceInventory.nil? && !resourceInventory["metadata"].nil?) 
+ continuationToken = resourceInventory["metadata"]["continue"] + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getResourcesAndContinuationTokenV2:Failed in get resources for #{uri} and continuation token: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + resourceInventory = nil + end + return continuationToken, resourceInventory, responseCode + end #getResourcesAndContinuationTokenV2 + def getResourcesAndContinuationToken(uri, api_group: nil) continuationToken = nil resourceInventory = nil begin @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" - resourceInfo = getKubeResourceInfo(uri, api_group: api_group) + responseCode, resourceInfo = getKubeResourceInfo(uri, api_group: api_group) @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Done getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" if !resourceInfo.nil? @Log.info "KubernetesApiClient::getResourcesAndContinuationToken:Start:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" @@ -1107,7 +1165,7 @@ def getPodOptimizedItem(resourceItem, isWindowsPodItem) currentContainerStatus["restartCount"] = containerStatus["restartCount"] currentContainerStatus["state"] = containerStatus["state"] currentContainerStatus["lastState"] = containerStatus["lastState"] - if isWindowsPod + if isWindowsPodItem currentContainerStatus["imageID"] = containerStatus["imageID"] end item["status"]["initContainerStatuses"].push(currentContainerStatus) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 8ee2e5fc2..121b1804f 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -603,39 +603,17 @@ def watch_nodes @nodeItemsCache.clear() } continuationToken = nil - $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API since nodesResourceVersion is 
#{nodesResourceVersion} @ #{Time.now.utc.iso8601}") resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - if !nodeItem.nil? && !nodeItem.empty? - @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem - } - else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end + $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" - end - while (!continuationToken.nil? && !continuationToken.empty?) 
- continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") if (!nodeInventory.nil? && !nodeInventory.empty?) nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? @@ -653,13 +631,45 @@ def watch_nodes end end else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}") + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # break, if any of the pagination call failed so that full cache can be rebuild with LIST again + break + else + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
+ $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? + @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + end end end end if nodesResourceVersion.nil? || nodesResourceVersion.empty? || nodesResourceVersion == "0" # https://github.com/kubernetes/kubernetes/issues/74022 - $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion: #{nodesResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") nodesResourceVersion = nil # for the LIST to happen again sleep(30) # do not overwhelm the api-server if api-server broken else diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index b6abbc263..50552a25d 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -252,38 +252,17 @@ def watch_pods @podItemsCache.clear() } continuationToken = nil - $log.info("in_kube_perfinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") - $log.info("in_kube_perfinventory::watch_pods:Done getting pods 
from Kube API @ #{Time.now.utc.iso8601}") - if (!podInventory.nil? && !podInventory.empty?) - podsResourceVersion = podInventory["metadata"]["resourceVersion"] - if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - podInventory["items"].each do |item| - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) - if !podItem.nil? && !podItem.empty? - @podCacheMutex.synchronize { - @podItemsCache[key] = podItem - } - else - $log.warn "in_kube_perfinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end + resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}" + $log.info("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") else - $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory" - end - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") + $log.info("in_kube_perfinventory::watch_pods:Done getting pods from Kube API:#{resourceUri} @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty?) 
podsResourceVersion = podInventory["metadata"]["resourceVersion"] if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? @@ -293,7 +272,7 @@ def watch_pods @podItemsCache[key] = podItem } else - $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_perfinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" end else $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" @@ -301,7 +280,40 @@ def watch_pods end end else - $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory" + end + while (!continuationToken.nil? && !continuationToken.empty?) + resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}" + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_perfinventory::watch_pods:Getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + break # break, if any of the pagination call failed so that full cache will rebuild with LIST again + else + if (!podInventory.nil? && !podInventory.empty?) + podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
+ $log.info("in_kube_perfinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + podItem = KubernetesApiClient.getOptimizedItem("pods-perf", item) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_perfinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_perfinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + end + end end end end @@ -364,7 +376,7 @@ def watch_pods $log.warn("in_kube_perfinventory::watch_pods:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end - $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_perfinventory::watch_pods:Watch connection got disconnected for pods @ #{Time.now.utc.iso8601}") end rescue Net::ReadTimeout => errorStr ## This expected if there is no activity more than readtimeout value used in the connection diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index c44943b0e..68704c4d3 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -718,46 +718,17 @@ def watch_pods currentWindowsNodeNameList = @windowsNodeNameListCache.dup } continuationToken = nil - $log.info("in_kube_podinventory::watch_pods:Getting pods from Kube API since podsResourceVersion is #{podsResourceVersion} @ #{Time.now.utc.iso8601}") - continuationToken, podInventory = 
KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") - $log.info("in_kube_podinventory::watch_pods:Done getting pods from Kube API @ #{Time.now.utc.iso8601}") - if (!podInventory.nil? && !podInventory.empty?) - podsResourceVersion = podInventory["metadata"]["resourceVersion"] - if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - podInventory["items"].each do |item| - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" - isWindowsPodItem = false - if !nodeName.empty? && - !currentWindowsNodeNameList.nil? && - !currentWindowsNodeNameList.empty? && - currentWindowsNodeNameList.include?(nodeName) - isWindowsPodItem = true - end - podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) - if !podItem.nil? && !podItem.empty? - @podCacheMutex.synchronize { - @podItemsCache[key] = podItem - } - else - $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" - end - else - $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end + resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}" + $log.info("in_kube_podinventory::watch_pods:Getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? 
|| responseCode != "200" + $log.warn("in_kube_podinventory::watch_pods: getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") else - $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory" - end - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") + $log.info("in_kube_podinventory::watch_pods:Done getting pods from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty?) podsResourceVersion = podInventory["metadata"]["resourceVersion"] if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") podInventory["items"].each do |item| key = item["metadata"]["uid"] if !key.nil? && !key.empty? @@ -775,7 +746,7 @@ def watch_pods @podItemsCache[key] = podItem } else - $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_pods:Received podItem either empty or nil @ #{Time.now.utc.iso8601}" end else $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" @@ -783,15 +754,56 @@ def watch_pods end end else - $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory" + end + while (!continuationToken.nil? && !continuationToken.empty?) 
+ resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}" + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_podinventory::watch_pods: getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + podsResourceVersion = nil + break # break, if any of the pagination call failed so that full cache will rebuild with LIST again + else + if (!podInventory.nil? && !podInventory.empty?) + podsResourceVersion = podInventory["metadata"]["resourceVersion"] + if (podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_pods:number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + podInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeName = (!item["spec"].nil? && !item["spec"]["nodeName"].nil?) ? item["spec"]["nodeName"] : "" + isWindowsPodItem = false + if !nodeName.empty? && + !currentWindowsNodeNameList.nil? && + !currentWindowsNodeNameList.empty? && + currentWindowsNodeNameList.include?(nodeName) + isWindowsPodItem = true + end + podItem = KubernetesApiClient.getOptimizedItem("pods", item, isWindowsPodItem) + if !podItem.nil? && !podItem.empty? + @podCacheMutex.synchronize { + @podItemsCache[key] = podItem + } + else + $log.warn "in_kube_podinventory::watch_pods:Received podItem is empty or nil @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_podinventory::watch_pods:Received poduid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_podinventory::watch_pods:Received empty podInventory @ #{Time.now.utc.iso8601}" + end + end end end end if podsResourceVersion.nil? || podsResourceVersion.empty? 
|| podsResourceVersion == "0" # https://github.com/kubernetes/kubernetes/issues/74022 - $log.warn("in_kube_podinventory::watch_pods:received podsResourceVersion: #{podsResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_pods:received podsResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") podsResourceVersion = nil # for the LIST to happen again - sleep(30) # do not overwhelm the api-server if api-server broken + sleep(30) # do not overwhelm the api-server if api-server down else begin $log.info("in_kube_podinventory::watch_pods:Establishing Watch connection for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") @@ -860,7 +872,7 @@ def watch_pods break end end - $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods with resourceversion: #{podsResourceVersion} @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_pods:Watch connection got disconnected for pods @ #{Time.now.utc.iso8601}") end rescue Net::ReadTimeout => errorStr ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection @@ -868,7 +880,7 @@ def watch_pods rescue => errorStr $log.warn("in_kube_podinventory::watch_pods:Watch failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") podsResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken + sleep(5) # do not overwhelm the api-server if api-server down ensure watcher.finish if watcher end @@ -892,44 +904,48 @@ def watch_services @serviceItemsCache.clear() } $log.info("in_kube_podinventory::watch_services:Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") - $log.info("in_kube_podinventory::watch_services: Done getting services from Kube API @ #{Time.now.utc.iso8601}") - if !serviceInfo.nil? 
- $log.info("in_kube_podinventory::watch_services:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") - serviceInventory = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) - $log.info("in_kube_podinventory::watch_services:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") - serviceInfo = nil - if (!serviceInventory.nil? && !serviceInventory.empty?) - servicesResourceVersion = serviceInventory["metadata"]["resourceVersion"] - if (serviceInventory.key?("items") && !serviceInventory["items"].nil? && !serviceInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_services:number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}") - serviceInventory["items"].each do |item| - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - serviceItem = KubernetesApiClient.getOptimizedItem("services", item) - if !serviceItem.nil? && !serviceItem.empty? - @serviceCacheMutex.synchronize { - @serviceItemsCache[key] = serviceItem - } + responseCode, serviceInfo = KubernetesApiClient.getKubeResourceInfoV2("services") + if responseCode.nil? || responseCode != "200" + $log.info("in_kube_podinventory::watch_services:Getting services from Kube API failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_podinventory::watch_services: Done getting services from Kube API @ #{Time.now.utc.iso8601}") + if !serviceInfo.nil? + $log.info("in_kube_podinventory::watch_services:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceInventory = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) + $log.info("in_kube_podinventory::watch_services:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceInfo = nil + if (!serviceInventory.nil? && !serviceInventory.empty?) + servicesResourceVersion = serviceInventory["metadata"]["resourceVersion"] + if (serviceInventory.key?("items") && !serviceInventory["items"].nil? 
&& !serviceInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_services:number of service items #{serviceInventory["items"].length} @ #{Time.now.utc.iso8601}") + serviceInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + serviceItem = KubernetesApiClient.getOptimizedItem("services", item) + if !serviceItem.nil? && !serviceItem.empty? + @serviceCacheMutex.synchronize { + @serviceItemsCache[key] = serviceItem + } + else + $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" + end else - $log.warn "in_kube_podinventory::watch_services:Received serviceItem either nil or empty @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_podinventory::watch_services:Received serviceuid either nil or empty @ #{Time.now.utc.iso8601}" end end + else + $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_podinventory::watch_services:Received empty serviceInventory @ #{Time.now.utc.iso8601}" + serviceInventory = nil end - serviceInventory = nil end end if servicesResourceVersion.nil? 
|| servicesResourceVersion == "" || servicesResourceVersion == "0" # https://github.com/kubernetes/kubernetes/issues/74022 - $log.warn("in_kube_podinventory::watch_services:received servicesResourceVersion: #{servicesResourceVersion} either nil or empty or 0 @ #{Time.now.utc.iso8601}") + $log.warn("in_kube_podinventory::watch_services:received servicesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil # for the LIST to happen again - sleep(30) # do not overwhelm the api-server if api-server broken + sleep(30) # do not overwhelm the api-server if api-server down else begin $log.info("in_kube_podinventory::watch_services:Establishing Watch connection for services with resourceversion: #{servicesResourceVersion} @ #{Time.now.utc.iso8601}") @@ -991,7 +1007,7 @@ def watch_services rescue => errorStr $log.warn("in_kube_podinventory::watch_services:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") servicesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken + sleep(5) # do not overwhelm the api-server if api-server down ensure watcher.finish if watcher end @@ -1014,36 +1030,17 @@ def watch_windows_nodes @windowsNodeNameListCache.clear() } continuationToken = nil - $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API since nodesResourceVersion is #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows&limit=#{@NODES_CHUNK_SIZE}") - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_podinventory::watch_windows_nodes:Done getting windows nodes from Kube API @ #{Time.now.utc.iso8601}") - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? 
&& !nodeInventory["items"].empty?) - $log.info("in_kube_podinventory::watch_windows_nodes: number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| - key = item["metadata"]["name"] - if !key.nil? && !key.empty? - @windowsNodeNameCacheMutex.synchronize { - if !@windowsNodeNameListCache.include?(key) - @windowsNodeNameListCache.push(key) - end - } - else - $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" - end - end - end + $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") else - $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" - end - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + $log.info("in_kube_podinventory::watch_windows_nodes:Done getting windows nodes from Kube API @ #{Time.now.utc.iso8601}") if (!nodeInventory.nil? && !nodeInventory.empty?) nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
- $log.info("in_kube_podinventory::watch_windows_nodes : number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::watch_windows_nodes: number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") nodeInventory["items"].each do |item| key = item["metadata"]["name"] if !key.nil? && !key.empty? @@ -1060,61 +1057,97 @@ def watch_windows_nodes else $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}") + if responseCode.nil? || responseCode != "200" + $log.info("in_kube_podinventory::watch_windows_nodes:Getting windows nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + break # break, if any of the pagination call failed so that full cache can be rebuild with LIST again + else + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_podinventory::watch_windows_nodes : number of windows node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["name"] + if !key.nil? && !key.empty? 
+ @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + else + $log.warn "in_kube_podinventory::watch_windows_nodes:Received node name either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_podinventory::watch_windows_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + end + end end end - begin - $log.info("in_kube_podinventory::watch_windows_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("nodes", label_selector: "kubernetes.io/os=windows", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_podinventory::watch_windows_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? - nodesResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.warn("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + if nodesResourceVersion.nil? || nodesResourceVersion.empty? 
|| nodesResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_podinventory::watch_windows_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server down + else + begin + $log.info("in_kube_podinventory::watch_windows_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", label_selector: "kubernetes.io/os=windows", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? + $log.warn("in_kube_podinventory::watch_windows_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_podinventory::watch_windows_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.warn("in_kube_podinventory::watch_windows_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
+ break + end + if notice["type"] == "ADDED" # we dont need to worry about modified event since we only need node name + key = item["metadata"]["name"] + @windowsNodeNameCacheMutex.synchronize { + if !@windowsNodeNameListCache.include?(key) + @windowsNodeNameListCache.push(key) + end + } + elsif notice["type"] == "DELETED" + key = item["metadata"]["name"] + @windowsNodeNameCacheMutex.synchronize { + @windowsNodeNameListCache.delete(key) + } + end + when "ERROR" nodesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + $log.warn("in_kube_podinventory::watch_windows_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") break + else + $log.warn("in_kube_podinventory::watch_windows_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end - if notice["type"] == "ADDED" # we dont need to worry about modified event since we only need node name - key = item["metadata"]["name"] - @windowsNodeNameCacheMutex.synchronize { - if !@windowsNodeNameListCache.include?(key) - @windowsNodeNameListCache.push(key) - end - } - elsif notice["type"] == "DELETED" - key = item["metadata"]["name"] - @windowsNodeNameCacheMutex.synchronize { - @windowsNodeNameListCache.delete(key) - } - end - when "ERROR" - nodesResourceVersion = nil - $log.warn("in_kube_podinventory::watch_windows_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - $log.warn("in_kube_podinventory::watch_windows_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") end end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity more than readtimeout value used in the connection + # $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ 
#{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity more than readtimeout value used in the connection - # $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end rescue => errorStr $log.warn("in_kube_podinventory::watch_windows_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") From 351f0ff0192d6abce97ba32d8177199c669053a2 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 25 Feb 2022 00:29:55 -0800 Subject: [PATCH 43/65] fix bug --- source/plugins/ruby/in_kube_podinventory.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 68704c4d3..326c85895 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -758,7 +758,7 @@ def watch_pods end while (!continuationToken.nil? && !continuationToken.empty?) resourceUri = "pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}" - continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) + continuationToken, podInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) if responseCode.nil? 
|| responseCode != "200" $log.warn("in_kube_podinventory::watch_pods: getting pods from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") podsResourceVersion = nil From 497bce47095ce8394b5f791176c6cf4f984b20e0 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 1 Mar 2022 16:08:07 -0800 Subject: [PATCH 44/65] preview image for internal customer validation --- charts/azuremonitor-containers/values.yaml | 4 ++-- kubernetes/omsagent.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index e15791d21..9c9a0f195 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -21,11 +21,11 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod01312022" + tag: "ciprodpreview03012022" tagWindows: "win-ciprod01312022" pullPolicy: IfNotPresent dockerProviderVersion: "16.0.0-0" - agentVersion: "azure-mdsd-1..17.0" + agentVersion: "azure-mdsd-1.17.0" winAgentVersion: "0.0.0-0" # there is no base agent version for windows agent # The priority used by the omsagent priority class for the daemonset pods diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 88ad931b1..6e6d44d51 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -357,7 +357,7 @@ spec: component: oms-agent tier: node annotations: - agentVersion: "azure-mdsd-1..17.0" + agentVersion: "azure-mdsd-1.17.0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: @@ -598,7 +598,7 @@ spec: labels: rsName: "omsagent-rs" annotations: - agentVersion: "azure-mdsd-1..17.0" + agentVersion: "azure-mdsd-1.17.0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: From 911be7eb8758d9fac970d04776a0e2ef4f49c603 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 4 Mar 2022 19:58:29 -0800 Subject: [PATCH 45/65] preview 
image --- kubernetes/linux/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index f3a9efd7a..ad94f001e 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,8 @@ ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprod01312022 +# TODO - revert to PROD version when PR gets merge +ARG IMAGE_TAG=ciprodpreview03012022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} From cec11dd4cd15253de61152914cd4da8b605b49f9 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 6 Mar 2022 21:41:07 -0800 Subject: [PATCH 46/65] wip --- .../ServiceGroupRoot/Scripts/pushAgentToAcr.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh index d39cedde0..e1b9df93a 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh +++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh @@ -15,16 +15,16 @@ fi #Make sure that tag being pushed will not overwrite an existing tag in mcr MCR_TAG_RESULT="`wget -qO- https://mcr.microsoft.com/v2/azuremonitor/containerinsights/ciprod/tags/list`" -if [ $? -ne 0 ]; then +if [ $? 
-ne 0 ]; then echo "-e error unable to get list of mcr tags for azuremonitor/containerinsights/ciprod repository" exit 1 fi TAG_EXISTS=$(echo $MCR_TAG_RESULT | jq '.tags | contains(["'"$AGENT_RELEASE$AGENT_IMAGE_TAG_SUFFIX"'"])') -if $TAG_EXISTS; then - echo "-e error ${AGENT_IMAGE_TAG_SUFFIX} already exists in mcr. make sure the image tag is unique" - exit 1 -fi +# if $TAG_EXISTS; then +# echo "-e error ${AGENT_IMAGE_TAG_SUFFIX} already exists in mcr. make sure the image tag is unique" +# exit 1 +# fi if [ -z $AGENT_IMAGE_FULL_PATH ]; then echo "-e error AGENT_IMAGE_FULL_PATH shouldnt be empty. check release variables" @@ -60,7 +60,7 @@ if [ $? -eq 0 ]; then else echo "-e error failed to login to az with managed identity credentials" exit 1 -fi +fi echo "Pushing ${AGENT_IMAGE_FULL_PATH} to ${ACR_NAME}" az acr import --name $ACR_NAME --registry $CDPX_REGISTRY --source official/${CDPX_REPO_NAME}:${CDPX_TAG} --image $AGENT_IMAGE_FULL_PATH From 3d092c81be9b9070e98e5cfbd078918e7eed69ce Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 6 Mar 2022 21:45:25 -0800 Subject: [PATCH 47/65] wip --- .../ServiceGroupRoot/Scripts/pushAgentToAcr.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh index e1b9df93a..c8338c01d 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh +++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh @@ -21,10 +21,10 @@ if [ $? -ne 0 ]; then fi TAG_EXISTS=$(echo $MCR_TAG_RESULT | jq '.tags | contains(["'"$AGENT_RELEASE$AGENT_IMAGE_TAG_SUFFIX"'"])') -# if $TAG_EXISTS; then -# echo "-e error ${AGENT_IMAGE_TAG_SUFFIX} already exists in mcr. make sure the image tag is unique" -# exit 1 -# fi +if $TAG_EXISTS; then + echo "-e error ${AGENT_IMAGE_TAG_SUFFIX} already exists in mcr. 
make sure the image tag is unique" + exit 1 +fi if [ -z $AGENT_IMAGE_FULL_PATH ]; then echo "-e error AGENT_IMAGE_FULL_PATH shouldnt be empty. check release variables" From 933f2a370b4b95acccd32932f19db1cb0d0a549c Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 4 Apr 2022 20:18:34 -0700 Subject: [PATCH 48/65] fix trailing whitespaces --- .../azuremonitor-containers/templates/omsagent-secret.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-secret.yaml b/charts/azuremonitor-containers/templates/omsagent-secret.yaml index 8c245338c..bf4e7eb3b 100644 --- a/charts/azuremonitor-containers/templates/omsagent-secret.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-secret.yaml @@ -15,14 +15,14 @@ data: DOMAIN: {{ .Values.omsagent.domain | b64enc | quote }} {{- $httpsProxyDict := urlParse .Values.Azure.proxySettings.httpsProxy -}} {{- $httpProxyDict := urlParse .Values.Azure.proxySettings.httpProxy -}} - {{- if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) ($httpsProxyDict.userinfo) }} + {{- if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) ($httpsProxyDict.userinfo) }} PROXY: {{ .Values.Azure.proxySettings.httpsProxy | b64enc | quote }} - {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) (empty $httpsProxyDict.userinfo) }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) (empty $httpsProxyDict.userinfo) }} # adding arbitrary creds since omsagent expects arbitrary creds in case of no auth PROXY: {{ urlJoin (dict "scheme" $httpsProxyDict.scheme "userinfo" "admin:secret" "host" $httpsProxyDict.host) | b64enc | quote }} {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) ($httpProxyDict.userinfo) }} PROXY: 
{{ .Values.Azure.proxySettings.httpProxy | b64enc | quote }} - {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) (empty $httpProxyDict.userinfo) }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) (empty $httpProxyDict.userinfo) }} # adding arbitrary creds since omsagent expects arbitrary creds in case of no auth PROXY: {{ urlJoin (dict "scheme" $httpProxyDict.scheme "userinfo" "admin:secret" "host" $httpProxyDict.host) | b64enc | quote }} {{- else if ne .Values.omsagent.proxy "" }} From 3047e7623427c2e065f331e42350c0de8f269c57 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 20 Apr 2022 21:22:36 -0700 Subject: [PATCH 49/65] fix bug --- source/plugins/ruby/KubernetesApiClient.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 7f8cd0498..ffd76bfbd 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -822,7 +822,7 @@ def getResourcesAndContinuationToken(uri, api_group: nil) resourceInventory = nil begin @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" - responseCode, resourceInfo = getKubeResourceInfo(uri, api_group: api_group) + resourceInfo = getKubeResourceInfo(uri, api_group: api_group) @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Done getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" if !resourceInfo.nil? 
@Log.info "KubernetesApiClient::getResourcesAndContinuationToken:Start:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" From e706feeec87c0e4a80b8cbb009bd15bac4d471d7 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 20 Apr 2022 21:49:01 -0700 Subject: [PATCH 50/65] remove unused envvars in yaml --- kubernetes/omsagent.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 96ea0c982..8bbdf9911 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -667,11 +667,9 @@ spec: valueFrom: resourceFieldRef: containerName: omsagent - resource: limits.cpu - # - name: MONITORING_MAX_EVENT_RATE - # value: "50000" # default 20KPS for MDSD, for large cluster validate 50KPS + resource: limits.cpu - name: EMIT_CACHE_TELEMETRY - value: "true" # enable only debug or test purpose and disable for prod + value: "false" # enable only debug or test purpose and disable for prod - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION From 1ac6672753beaaa395ce8c4575377f154cbcd9f1 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 21 Apr 2022 17:23:23 -0700 Subject: [PATCH 51/65] revert minor things --- .../ServiceGroupRoot/Scripts/pushAgentToAcr.sh | 4 ++-- kubernetes/omsagent.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh index de306b50a..25eb43f47 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh +++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh @@ -15,7 +15,7 @@ fi #Make sure that tag being pushed will not overwrite an existing tag in mcr MCR_TAG_RESULT="`wget -qO- https://mcr.microsoft.com/v2/azuremonitor/containerinsights/ciprod/tags/list`" -if [ $? -ne 0 ]; then +if [ $? 
-ne 0 ]; then echo "-e error unable to get list of mcr tags for azuremonitor/containerinsights/ciprod repository" exit 1 fi @@ -67,7 +67,7 @@ if [ $? -eq 0 ]; then else echo "-e error failed to login to az with managed identity credentials" exit 1 -fi +fi echo "Pushing ${AGENT_IMAGE_FULL_PATH} to ${ACR_NAME}" az acr import --name $ACR_NAME --registry $CDPX_REGISTRY --source official/${CDPX_REPO_NAME}:${CDPX_TAG} --image $AGENT_IMAGE_FULL_PATH diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 8bbdf9911..8cbd7412b 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -657,8 +657,8 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 5 - memory: 5Gi + cpu: 1 + memory: 1Gi requests: cpu: 150m memory: 250Mi @@ -667,7 +667,7 @@ spec: valueFrom: resourceFieldRef: containerName: omsagent - resource: limits.cpu + resource: limits.cpu - name: EMIT_CACHE_TELEMETRY value: "false" # enable only debug or test purpose and disable for prod - name: AKS_RESOURCE_ID From 4bb069e5c666bb8588a031e8794a29c8e3f51e90 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 21 Apr 2022 18:17:53 -0700 Subject: [PATCH 52/65] telemetry tags for preview release --- kubernetes/linux/Dockerfile | 2 +- kubernetes/windows/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index becbe1157..3044f0aa2 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,7 @@ ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprod03172022 +ARG IMAGE_TAG=ciprodpreview04222022 
ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 7c514a777..87e7454c0 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod03172022 +ARG IMAGE_TAG=win-ciprodpreview04222022 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement From 1262c8a84b16c7a4bab3a5c3df1aaa8acdf60399 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 28 Apr 2022 19:48:05 -0700 Subject: [PATCH 53/65] revert preview image tags --- kubernetes/linux/Dockerfile | 2 +- kubernetes/windows/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 3044f0aa2..becbe1157 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,7 @@ ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprodpreview04222022 +ARG IMAGE_TAG=ciprod03172022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 87e7454c0..7c514a777 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprodpreview04222022 +ARG IMAGE_TAG=win-ciprod03172022 # Do not split this into multiple RUN! 
# Docker creates a layer for every RUN-Statement From 5a67c0c0ee429de301b6b260812ec18b354d85dc Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 1 May 2022 08:44:15 -0700 Subject: [PATCH 54/65] revert unintended change --- .../ruby/kubernetes_container_inventory.rb | 63 +++++++++---------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index ffe92ec40..82e36c8cc 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -50,7 +50,7 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !atLocation.nil? containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1] end - end + end containerInventoryRecord["ExitCode"] = 0 isContainerTerminated = false isContainerWaiting = false @@ -84,19 +84,19 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa end containerInfoMap = containersInfoMap[containerName] - # image can be in any one of below format in spec - # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image + # image can be in any one of below format in spec + # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image imageValue = containerInfoMap["image"] if !imageValue.nil? && !imageValue.empty? # Find delimiters in image format atLocation = imageValue.index("@") - isDigestSpecified = false + isDigestSpecified = false if !atLocation.nil? # repository/image@digest or repository/image:imagetag@digest, image@digest imageValue = imageValue[0..(atLocation - 1)] # Use Digest from the spec's image in case when the status doesnt get populated i.e. container in pending or image pull back etc. if containerInventoryRecord["ImageId"].nil? 
|| containerInventoryRecord["ImageId"].empty? - containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] + containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] end isDigestSpecified = true end @@ -105,14 +105,14 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !colonLocation.nil? if slashLocation.nil? # image:imagetag - containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] + containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] else # repository/image:imagetag containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] end containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] - else + else if slashLocation.nil? # image containerInventoryRecord["Image"] = imageValue @@ -120,15 +120,15 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa # repo/image containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..-1] - end + end # if no tag specified, k8s assumes latest as imagetag and this is same behavior from docker API and from status. # Ref - https://kubernetes.io/docs/concepts/containers/images/#image-names - if isDigestSpecified == false + if isDigestSpecified == false containerInventoryRecord["ImageTag"] = "latest" end - end + end end - + podName = containerInfoMap["PodName"] namespace = containerInfoMap["Namespace"] # containername in the format what docker sees @@ -199,11 +199,7 @@ def getContainersInfoMap(podItem, isWindows) cmdValue = container["command"] cmdValueString = (cmdValue.nil?) ? 
"" : cmdValue.to_s containerInfoMap["Command"] = cmdValueString - if isWindows - containerInfoMap["EnvironmentVar"] = container["env"] - else - containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) - end + containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) containersInfoMap[containerName] = containerInfoMap end end @@ -216,47 +212,47 @@ def getContainersInfoMap(podItem, isWindows) return containersInfoMap end - def obtainContainerEnvironmentVars(containerId) + def obtainContainerEnvironmentVars(containerId) envValueString = "" begin - isCGroupPidFetchRequired = false + isCGroupPidFetchRequired = false if !@@containerCGroupCache.has_key?(containerId) - isCGroupPidFetchRequired = true + isCGroupPidFetchRequired = true else cGroupPid = @@containerCGroupCache[containerId] - if cGroupPid.nil? || cGroupPid.empty? + if cGroupPid.nil? || cGroupPid.empty? isCGroupPidFetchRequired = true @@containerCGroupCache.delete(containerId) - elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") + elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") isCGroupPidFetchRequired = true - @@containerCGroupCache.delete(containerId) - end + @@containerCGroupCache.delete(containerId) + end end - if isCGroupPidFetchRequired + if isCGroupPidFetchRequired Dir["/hostfs/proc/*/cgroup"].each do |filename| begin if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any? 
# file full path is /hostfs/proc//cgroup - cGroupPid = filename.split("/")[3] - if is_number?(cGroupPid) + cGroupPid = filename.split("/")[3] + if is_number?(cGroupPid) if @@containerCGroupCache.has_key?(containerId) - tempCGroupPid = @@containerCGroupCache[containerId] + tempCGroupPid = @@containerCGroupCache[containerId] if tempCGroupPid.to_i > cGroupPid.to_i @@containerCGroupCache[containerId] = cGroupPid end else @@containerCGroupCache[containerId] = cGroupPid - end + end end end - rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read - end - end + rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read + end + end end cGroupPid = @@containerCGroupCache[containerId] if !cGroupPid.nil? && !cGroupPid.empty? - environFilePath = "/hostfs/proc/#{cGroupPid}/environ" + environFilePath = "/hostfs/proc/#{cGroupPid}/environ" if File.exist?(environFilePath) # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE # Check to see if the environment variable collection is disabled for this container. @@ -269,7 +265,7 @@ def obtainContainerEnvironmentVars(containerId) if !envVars.nil? && !envVars.empty? envVars = envVars.split("\0") envValueString = envVars.to_json - envValueStringLength = envValueString.length + envValueStringLength = envValueString.length if envValueStringLength >= 200000 lastIndex = envValueString.rindex("\",") if !lastIndex.nil? 
@@ -380,7 +376,6 @@ def deleteCGroupCacheEntryForDeletedContainer(containerId) ApplicationInsightsUtility.sendExceptionTelemetry(error) end end - def is_number?(value) true if Integer(value) rescue false end From 3f2e05f24ab342c26d4623255a9bb9e6ea362eb5 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 11 May 2022 10:57:31 -0700 Subject: [PATCH 55/65] fix bug --- .../ruby/kubernetes_container_inventory.rb | 64 ++++++++++--------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index 82e36c8cc..81889b61b 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -50,7 +50,7 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !atLocation.nil? containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1] end - end + end containerInventoryRecord["ExitCode"] = 0 isContainerTerminated = false isContainerWaiting = false @@ -84,19 +84,19 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa end containerInfoMap = containersInfoMap[containerName] - # image can be in any one of below format in spec - # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image + # image can be in any one of below format in spec + # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image imageValue = containerInfoMap["image"] if !imageValue.nil? && !imageValue.empty? # Find delimiters in image format atLocation = imageValue.index("@") - isDigestSpecified = false + isDigestSpecified = false if !atLocation.nil? 
# repository/image@digest or repository/image:imagetag@digest, image@digest imageValue = imageValue[0..(atLocation - 1)] # Use Digest from the spec's image in case when the status doesnt get populated i.e. container in pending or image pull back etc. if containerInventoryRecord["ImageId"].nil? || containerInventoryRecord["ImageId"].empty? - containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] + containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] end isDigestSpecified = true end @@ -105,14 +105,14 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !colonLocation.nil? if slashLocation.nil? # image:imagetag - containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] + containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] else # repository/image:imagetag containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] end containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] - else + else if slashLocation.nil? # image containerInventoryRecord["Image"] = imageValue @@ -120,15 +120,15 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa # repo/image containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..-1] - end + end # if no tag specified, k8s assumes latest as imagetag and this is same behavior from docker API and from status. 
# Ref - https://kubernetes.io/docs/concepts/containers/images/#image-names - if isDigestSpecified == false + if isDigestSpecified == false containerInventoryRecord["ImageTag"] = "latest" end - end + end end - + podName = containerInfoMap["PodName"] namespace = containerInfoMap["Namespace"] # containername in the format what docker sees @@ -199,7 +199,12 @@ def getContainersInfoMap(podItem, isWindows) cmdValue = container["command"] cmdValueString = (cmdValue.nil?) ? "" : cmdValue.to_s containerInfoMap["Command"] = cmdValueString - containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) + if isWindows + # For windows container inventory, we dont need to get envvars from pods response since its already taken care in KPI as part of pod optimized item + containerInfoMap["EnvironmentVar"] = container["env"] + else + containerInfoMap["EnvironmentVar"] = obtainContainerEnvironmentVarsFromPodsResponse(podItem, container) + end containersInfoMap[containerName] = containerInfoMap end end @@ -212,47 +217,47 @@ def getContainersInfoMap(podItem, isWindows) return containersInfoMap end - def obtainContainerEnvironmentVars(containerId) + def obtainContainerEnvironmentVars(containerId) envValueString = "" begin - isCGroupPidFetchRequired = false + isCGroupPidFetchRequired = false if !@@containerCGroupCache.has_key?(containerId) - isCGroupPidFetchRequired = true + isCGroupPidFetchRequired = true else cGroupPid = @@containerCGroupCache[containerId] - if cGroupPid.nil? || cGroupPid.empty? + if cGroupPid.nil? || cGroupPid.empty? 
isCGroupPidFetchRequired = true @@containerCGroupCache.delete(containerId) - elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") + elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") isCGroupPidFetchRequired = true - @@containerCGroupCache.delete(containerId) - end + @@containerCGroupCache.delete(containerId) + end end - if isCGroupPidFetchRequired + if isCGroupPidFetchRequired Dir["/hostfs/proc/*/cgroup"].each do |filename| begin if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any? # file full path is /hostfs/proc//cgroup - cGroupPid = filename.split("/")[3] - if is_number?(cGroupPid) + cGroupPid = filename.split("/")[3] + if is_number?(cGroupPid) if @@containerCGroupCache.has_key?(containerId) - tempCGroupPid = @@containerCGroupCache[containerId] + tempCGroupPid = @@containerCGroupCache[containerId] if tempCGroupPid.to_i > cGroupPid.to_i @@containerCGroupCache[containerId] = cGroupPid end else @@containerCGroupCache[containerId] = cGroupPid - end + end end end - rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read - end - end + rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read + end + end end cGroupPid = @@containerCGroupCache[containerId] if !cGroupPid.nil? && !cGroupPid.empty? - environFilePath = "/hostfs/proc/#{cGroupPid}/environ" + environFilePath = "/hostfs/proc/#{cGroupPid}/environ" if File.exist?(environFilePath) # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE # Check to see if the environment variable collection is disabled for this container. @@ -265,7 +270,7 @@ def obtainContainerEnvironmentVars(containerId) if !envVars.nil? && !envVars.empty? 
envVars = envVars.split("\0") envValueString = envVars.to_json - envValueStringLength = envValueString.length + envValueStringLength = envValueString.length if envValueStringLength >= 200000 lastIndex = envValueString.rindex("\",") if !lastIndex.nil? @@ -376,6 +381,7 @@ def deleteCGroupCacheEntryForDeletedContainer(containerId) ApplicationInsightsUtility.sendExceptionTelemetry(error) end end + def is_number?(value) true if Integer(value) rescue false end From 7dad848ae8aae4d21614bcb05fe665f0155b3ad6 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 12 May 2022 23:08:40 -0700 Subject: [PATCH 56/65] use same batchtime for both mdm & podinventory records --- source/plugins/ruby/in_kube_podinventory.rb | 17 ++++++++++------ .../plugins/ruby/in_kube_podmdminventory.rb | 20 +++++++++---------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 326c85895..2fbdb074c 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -148,7 +148,7 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] @podInventoryE2EProcessingLatencyMs = 0 - @mdmPodRecords = [] + @mdmPodRecordItems = [] podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") @@ -211,7 +211,7 @@ def enumerate(podList = nil) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil - @mdmPodRecords = nil + @mdmPodRecordItems = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -351,11 +351,16 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if continuationToken.nil? 
#no more chunks in this batch to be sent, write all mdm pod inventory records to send if CustomMetricsUtils.check_custom_metrics_availability begin - if !@mdmPodRecords.nil? && @mdmPodRecords.length > 0 + if !@mdmPodRecordItems.nil? && @mdmPodRecordItems.length > 0 + mdmPodRecords = { + "collectionTime": batchTime, + "items": @mdmPodRecordItems, + } mdmPodRecordsJson = @mdmPodRecords.to_json @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" writeMDMRecords(mdmPodRecordsJson) + mdmPodRecords = nil @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" end rescue => err @@ -647,7 +652,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) records.push(record) end #container status block end - @mdmPodRecords.push(mdmPodRecord.dup) + @mdmPodRecordItems.push(mdmPodRecord.dup) records.each do |record| if !record.nil? @@ -1175,11 +1180,11 @@ def writeMDMRecords(mdmRecordsJson) raise "in_kube_podinventory:writeMDMRecords:Failed to open file for write" end rescue => err - if retryAttemptCount < maxRetryCount + if retryAttemptCount <= maxRetryCount f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? 
- retryAttemptCount = retryAttemptCount + 1 sleep (initialRetryDelaySecs * retryAttemptCount) + retryAttemptCount = retryAttemptCount + 1 retry end $log.warn "in_kube_podinventory:writeMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 8272420c3..40a5c73d6 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -62,9 +62,7 @@ def enumerate if !@isCustomMetricsAvailability $log.warn "in_kube_podmdminventory::enumerate:skipping since custom metrics not available either for this cluster type or the region" else - currentTime = Time.now - batchTime = currentTime.utc.iso8601 - parse_and_emit_records(batchTime) + parse_and_emit_records() end rescue => errorStr $log.warn "in_kube_podmdminventory::enumerate:Failed in enumerate: #{errorStr}" @@ -73,13 +71,15 @@ def enumerate end end - def parse_and_emit_records(batchTime = Time.utc.iso8601) + def parse_and_emit_records() begin $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:getMDMRecords @ #{Time.now.utc.iso8601}" mdmPodRecords = getMDMRecords() - $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" - if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmPodRecords.length > 0 - mdmPodRecords.each do |record| + mdmPodRecordItems = + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" + if !mdmPodRecords.nil? && !mdmPodRecords.empty? 
&& mdmPodRecords["items"].length > 0 + batchTime = mdmPodRecords["collectionTime"] # This is time KubePODinventory plugin collected + mdmPodRecords["items"].each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) containerRecords = record["containerRecords"] @@ -180,7 +180,7 @@ def getMDMRecords() maxRetryCount = 3 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 - mdmRecords = [] + mdmRecords = {} begin f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") if !f.nil? @@ -194,11 +194,11 @@ def getMDMRecords() raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read" end rescue => err - if retryAttemptCount < maxRetryCount + if retryAttemptCount <= maxRetryCount f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? - retryAttemptCount = retryAttemptCount + 1 sleep (initialRetryDelaySecs * retryAttemptCount) + retryAttemptCount = retryAttemptCount + 1 retry end $log.warn "in_kube_podmdminventory:getMDMRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" From f49dffdf4f00bbe12957e29a08d578c896bb02d3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 13 May 2022 10:07:19 -0700 Subject: [PATCH 57/65] use same batchtime for both mdm & podinventory records --- source/plugins/ruby/in_kube_podinventory.rb | 3 ++- source/plugins/ruby/in_kube_podmdminventory.rb | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 2fbdb074c..b84b53d28 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -356,11 +356,12 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc "collectionTime": batchTime, "items": @mdmPodRecordItems, } - mdmPodRecordsJson = 
@mdmPodRecords.to_json + mdmPodRecordsJson = mdmPodRecords.to_json @log.info "Writing pod inventory mdm records to mdm podinventory state file with size(bytes): #{mdmPodRecordsJson.length}" @log.info "in_kube_podinventory::parse_and_emit_records:Start:writeMDMRecords @ #{Time.now.utc.iso8601}" writeMDMRecords(mdmPodRecordsJson) mdmPodRecords = nil + mdmPodRecordsJson = nil @log.info "in_kube_podinventory::parse_and_emit_records:End:writeMDMRecords @ #{Time.now.utc.iso8601}" end rescue => err diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 40a5c73d6..5f5aff714 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -78,7 +78,7 @@ def parse_and_emit_records() mdmPodRecordItems = $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmPodRecords["items"].length > 0 - batchTime = mdmPodRecords["collectionTime"] # This is time KubePODinventory plugin collected + batchTime = mdmPodRecords["collectionTime"] # This is same batchTime used in KubePODinventory mdmPodRecords["items"].each do |record| @inventoryToMdmConvertor.process_pod_inventory_record(record) @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], record["PodReadyCondition"]) From f4824b297dd96de7b47a31ca250093cd1b014cf3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 14 May 2022 16:39:55 -0700 Subject: [PATCH 58/65] use same batchtime for both mdm & podinventory records --- source/plugins/ruby/in_kube_podmdminventory.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 5f5aff714..a7d8c4765 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb 
@@ -189,7 +189,7 @@ def getMDMRecords() startTime = (Time.now.to_f * 1000).to_i mdmRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) - $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords["items"].length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read" end From ab3b042e4ba22a9e78b0d0631f89717720beb1a8 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 15 May 2022 13:56:15 -0700 Subject: [PATCH 59/65] use same batchtime for both mdm & podinventory records --- source/plugins/ruby/in_kube_podmdminventory.rb | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index a7d8c4765..5be9bc99c 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -40,6 +40,7 @@ def start $log.info("in_kube_podmdminventory::start @ #{Time.now.utc.iso8601}") @isCustomMetricsAvailability = CustomMetricsUtils.check_custom_metrics_availability @finished = false + @prevCollectionTime = nil @condition = ConditionVariable.new @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) @@ -177,7 +178,7 @@ def run_periodic end def getMDMRecords() - maxRetryCount = 3 + maxRetryCount = 5 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 mdmRecords = {} @@ -189,6 +190,10 @@ def getMDMRecords() startTime = (Time.now.to_f * 1000).to_i mdmRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) + if mdmRecords.nil? || mdmRecords.empty? || mdmRecords["items"].nil? 
|| mdmRecords["collectionTime"] == @prevCollectionTime + raise "in_kube_podmdminventory:getMDMRecords: either read mdmRecords is nil or empty or stale" + end + @prevCollectionTime = mdmRecords["collectionTime"] $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords["items"].length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read" From e39a120144022c4fcd4dba5f4c5e49ff6a394466 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 20 May 2022 10:52:03 -0700 Subject: [PATCH 60/65] preview image tag with latest ci_dev changes --- kubernetes/linux/Dockerfile.multiarch | 2 +- kubernetes/windows/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/Dockerfile.multiarch b/kubernetes/linux/Dockerfile.multiarch index fd0330d5d..133f40178 100644 --- a/kubernetes/linux/Dockerfile.multiarch +++ b/kubernetes/linux/Dockerfile.multiarch @@ -29,7 +29,7 @@ RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl COPY --from=builder /src/kubernetes/linux/Linux_ULINUX_1.0_*_64_Release/docker-cimprov-*.*.*-*.*.sh $tmpdir/ COPY kubernetes/linux/setup.sh kubernetes/linux/main.sh kubernetes/linux/defaultpromenvvariables kubernetes/linux/defaultpromenvvariables-rs kubernetes/linux/defaultpromenvvariables-sidecar kubernetes/linux/mdsd.xml kubernetes/linux/envmdsd kubernetes/linux/logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprod05192022 +ARG IMAGE_TAG=ciprodpreview05202022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 383652e0e..e74d05e96 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -5,7 +5,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod05192022 +ARG 
IMAGE_TAG=win-ciprodpreview05202022 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement From cae999b7a1e2b1385b53a3aac4d77d6bbf5b7660 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 8 Jun 2022 19:43:34 -0700 Subject: [PATCH 61/65] change back to use prod image in docker files --- kubernetes/linux/Dockerfile.multiarch | 2 +- kubernetes/windows/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/Dockerfile.multiarch b/kubernetes/linux/Dockerfile.multiarch index 133f40178..fd0330d5d 100644 --- a/kubernetes/linux/Dockerfile.multiarch +++ b/kubernetes/linux/Dockerfile.multiarch @@ -29,7 +29,7 @@ RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl COPY --from=builder /src/kubernetes/linux/Linux_ULINUX_1.0_*_64_Release/docker-cimprov-*.*.*-*.*.sh $tmpdir/ COPY kubernetes/linux/setup.sh kubernetes/linux/main.sh kubernetes/linux/defaultpromenvvariables kubernetes/linux/defaultpromenvvariables-rs kubernetes/linux/defaultpromenvvariables-sidecar kubernetes/linux/mdsd.xml kubernetes/linux/envmdsd kubernetes/linux/logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprodpreview05202022 +ARG IMAGE_TAG=ciprod05192022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index e74d05e96..383652e0e 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -5,7 +5,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprodpreview05202022 +ARG IMAGE_TAG=win-ciprod05192022 # Do not split this into multiple RUN! 
# Docker creates a layer for every RUN-Statement From b4e5427d5df5f5632446546405a7e2a3c8565564 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 10 Jun 2022 18:58:05 -0700 Subject: [PATCH 62/65] fix unit test failures --- kubernetes/linux/main.sh | 4 +- source/plugins/ruby/in_kube_nodes.rb | 274 ++++++++++++---------- source/plugins/ruby/in_kube_nodes_test.rb | 118 +++++----- 3 files changed, 210 insertions(+), 186 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c45ef6024..1e00457d9 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -95,8 +95,8 @@ setReplicaSetSpecificConfig() { export FLUENTD_KUBE_PERF_QUEUE_LIMIT_LENGTH="20" export FLUENTD_MDM_FLUSH_THREAD_COUNT="5" # default case $NUM_OF_FLUENTD_WORKERS in - 5) - export NUM_OF_FLUENTD_WORKERS=5 + [5-9]|9[0-9]|100) + export NUM_OF_FLUENTD_WORKERS=5 # Max is 5 core even if the specified limits more than 5 cores export FLUENTD_POD_INVENTORY_WORKER_ID=4 export FLUENTD_NODE_INVENTORY_WORKER_ID=3 export FLUENTD_EVENT_INVENTORY_WORKER_ID=2 diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 8a017243c..690a1ca8c 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -7,11 +7,12 @@ module Fluent::Plugin class Kube_nodeInventory_Input < Input Fluent::Plugin.register_input("kube_nodes", self) - def initialize(kubernetesApiClient = nil, + def initialize(is_unit_test_mode = nil, kubernetesApiClient = nil, applicationInsightsUtility = nil, extensionUtils = nil, env = nil, - telemetry_flush_interval = nil) + telemetry_flush_interval = nil, + node_items_test_cache = nil) super() require "yaml" @@ -30,6 +31,8 @@ def initialize(kubernetesApiClient = nil, @extensionUtils = extensionUtils == nil ? ExtensionUtils : extensionUtils @env = env == nil ? ENV : env @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = telemetry_flush_interval == nil ? 
Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES : telemetry_flush_interval + @is_unit_test_mode = is_unit_test_mode == nil ? false : true + @node_items_test_cache = node_items_test_cache # these defines were previously at class scope Moving them into the constructor so that they can be set by unit tests @@configMapMountPath = "/etc/config/settings/log-data-collection-settings" @@ -65,6 +68,7 @@ def initialize(kubernetesApiClient = nil, @NodeCache = NodeStatsCache.new() @watchNodesThread = nil @nodeItemsCache = {} + @nodeItemsCacheSizeKB = 0 end config_param :run_interval, :time, :default => 60 @@ -153,14 +157,9 @@ def enumerate # Initializing continuation token to nil continuationToken = nil nodeInventory = {} - nodeItemsCacheSizeKB = 0 + @nodeItemsCacheSizeKB = 0 nodeCount = 0 - @nodeCacheMutex.synchronize { - nodeInventory["items"] = @nodeItemsCache.values.clone - if KubernetesApiClient.isEmitCacheTelemetry() - nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024 - end - } + nodeInventory["items"] = getNodeItemsFromCache() nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
@@ -178,7 +177,7 @@ def enumerate @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) telemetryProperties = {} if KubernetesApiClient.isEmitCacheTelemetry() - telemetryProperties["NODE_ITEMS_CACHE_SIZE_KB"] = nodeItemsCacheSizeKB + telemetryProperties["NODE_ITEMS_CACHE_SIZE_KB"] = @nodeItemsCacheSizeKB end ApplicationInsightsUtility.sendMetricTelemetry("NodeCount", nodeCount, telemetryProperties) @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i @@ -596,58 +595,110 @@ def getNodeTelemetryProps(item) end def watch_nodes - $log.info("in_kube_nodes::watch_nodes:Start @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - loop do - begin - if nodesResourceVersion.nil? - # clear cache before filling the cache with list - @nodeCacheMutex.synchronize { - @nodeItemsCache.clear() - } - continuationToken = nil - resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") - $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") - continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) - if responseCode.nil? || responseCode != "200" - $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? 
- nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - if !nodeItem.nil? && !nodeItem.empty? - @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem - } + if !@is_unit_test_mode + $log.info("in_kube_nodes::watch_nodes:Start @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + loop do + begin + if nodesResourceVersion.nil? + # clear cache before filling the cache with list + @nodeCacheMutex.synchronize { + @nodeItemsCache.clear() + } + continuationToken = nil + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") + $log.info("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri) + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_nodes::watch_nodes:Done getting nodes from Kube API: #{resourceUri} @ #{Time.now.utc.iso8601}") + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_nodes::watch_nodes: number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? 
+ @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + end + end + end + else + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" + end + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}") + if responseCode.nil? || responseCode != "200" + $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # break, if any of the pagination call failed so that full cache can be rebuild with LIST again + break + else + if (!nodeInventory.nil? && !nodeInventory.empty?) + nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] + if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory["items"].each do |item| + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? + nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) + if !nodeItem.nil? && !nodeItem.empty? 
+ @nodeCacheMutex.synchronize { + @nodeItemsCache[key] = nodeItem + } + else + $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" + end + else + $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + end + end end else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" + $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end end end - else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" end - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, nodeInventory, responseCode = KubernetesApiClient.getResourcesAndContinuationTokenV2(resourceUri + "&continue=#{continuationToken}") - if responseCode.nil? || responseCode != "200" - $log.warn("in_kube_nodes::watch_nodes:Getting nodes from Kube API: #{resourceUri}&continue=#{continuationToken} failed with statuscode: #{responseCode} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil # break, if any of the pagination call failed so that full cache can be rebuild with LIST again - break + end + if nodesResourceVersion.nil? || nodesResourceVersion.empty? || nodesResourceVersion == "0" + # https://github.com/kubernetes/kubernetes/issues/74022 + $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil # for the LIST to happen again + sleep(30) # do not overwhelm the api-server if api-server broken + else + begin + $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) + if watcher.nil? 
+ $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") else - if (!nodeInventory.nil? && !nodeInventory.empty?) - nodesResourceVersion = nodeInventory["metadata"]["resourceVersion"] - if (nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - $log.info("in_kube_nodes::watch_nodes : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory["items"].each do |item| + watcher.each do |notice| + case notice["type"] + when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" + item = notice["object"] + # extract latest resource version to use for watch reconnect + if !item.nil? && !item.empty? && + !item["metadata"].nil? && !item["metadata"].empty? && + !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? + nodesResourceVersion = item["metadata"]["resourceVersion"] + # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") + else + $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + break + end + if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) key = item["metadata"]["uid"] if !key.nil? && !key.empty? 
nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) @@ -661,93 +712,43 @@ def watch_nodes else $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" end - end - end - else - $log.warn "in_kube_nodes::watch_nodes:Received empty nodeInventory @ #{Time.now.utc.iso8601}" - end - end - end - end - end - if nodesResourceVersion.nil? || nodesResourceVersion.empty? || nodesResourceVersion == "0" - # https://github.com/kubernetes/kubernetes/issues/74022 - $log.warn("in_kube_nodes::watch_nodes:received nodesResourceVersion either nil or empty or 0 @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil # for the LIST to happen again - sleep(30) # do not overwhelm the api-server if api-server broken - else - begin - $log.info("in_kube_nodes::watch_nodes:Establishing Watch connection for nodes with resourceversion: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - watcher = KubernetesApiClient.watch("nodes", resource_version: nodesResourceVersion, allow_watch_bookmarks: true) - if watcher.nil? - $log.warn("in_kube_nodes::watch_nodes:watch API returned nil watcher for watch connection with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - watcher.each do |notice| - case notice["type"] - when "ADDED", "MODIFIED", "DELETED", "BOOKMARK" - item = notice["object"] - # extract latest resource version to use for watch reconnect - if !item.nil? && !item.empty? && - !item["metadata"].nil? && !item["metadata"].empty? && - !item["metadata"]["resourceVersion"].nil? && !item["metadata"]["resourceVersion"].empty? 
- nodesResourceVersion = item["metadata"]["resourceVersion"] - # $log.info("in_kube_nodes::watch_nodes: received event type: #{notice["type"]} with resource version: #{nodesResourceVersion} @ #{Time.now.utc.iso8601}") - else - $log.info("in_kube_nodes::watch_nodes: received event type with no resourceVersion hence stopping watcher to reconnect @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - # We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! - break - end - if ((notice["type"] == "ADDED") || (notice["type"] == "MODIFIED")) - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? - nodeItem = KubernetesApiClient.getOptimizedItem("nodes", item) - if !nodeItem.nil? && !nodeItem.empty? + elsif notice["type"] == "DELETED" + key = item["metadata"]["uid"] + if !key.nil? && !key.empty? @nodeCacheMutex.synchronize { - @nodeItemsCache[key] = nodeItem + @nodeItemsCache.delete(key) } - else - $log.warn "in_kube_nodes::watch_nodes:Received nodeItem nil or empty @ #{Time.now.utc.iso8601}" end - else - $log.warn "in_kube_nodes::watch_nodes:Received node uid either nil or empty @ #{Time.now.utc.iso8601}" - end - elsif notice["type"] == "DELETED" - key = item["metadata"]["uid"] - if !key.nil? && !key.empty? 
- @nodeCacheMutex.synchronize { - @nodeItemsCache.delete(key) - } end + when "ERROR" + nodesResourceVersion = nil + $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") + break + else + nodesResourceVersion = nil + $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") + break end - when "ERROR" - nodesResourceVersion = nil - $log.warn("in_kube_nodes::watch_nodes:ERROR event with :#{notice["object"]} @ #{Time.now.utc.iso8601}") - break - else - nodesResourceVersion = nil - $log.warn("in_kube_nodes::watch_nodes:Unsupported event type #{notice["type"]} @ #{Time.now.utc.iso8601}") - break end end + rescue Net::ReadTimeout => errorStr + ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection + # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil + sleep(5) # do not overwhelm the api-server if api-server broken + ensure + watcher.finish if watcher end - rescue Net::ReadTimeout => errorStr - ## This expected if there is no activity on the cluster for more than readtimeout value used in the connection - # $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") - nodesResourceVersion = nil - sleep(5) # do not overwhelm the api-server if api-server broken - ensure - watcher.finish if watcher end + rescue => errorStr + $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ #{Time.now.utc.iso8601}") + nodesResourceVersion = nil end - rescue => errorStr - $log.warn("in_kube_nodes::watch_nodes:failed with an error: #{errorStr} @ 
#{Time.now.utc.iso8601}") - nodesResourceVersion = nil end + $log.info("in_kube_nodes::watch_nodes:End @ #{Time.now.utc.iso8601}") end - $log.info("in_kube_nodes::watch_nodes:End @ #{Time.now.utc.iso8601}") end def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) @@ -782,6 +783,21 @@ def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) f.close if !f.nil? end end + + def getNodeItemsFromCache() + nodeItems = {} + if @is_unit_test_mode + nodeItems = @node_items_test_cache + else + @nodeCacheMutex.synchronize { + nodeItems = @nodeItemsCache.values.clone + if KubernetesApiClient.isEmitCacheTelemetry() + @nodeItemsCacheSizeKB = @nodeItemsCache.to_s.length / 1024 + end + } + end + return nodeItems + end end # Kube_Node_Input class NodeStatsCache diff --git a/source/plugins/ruby/in_kube_nodes_test.rb b/source/plugins/ruby/in_kube_nodes_test.rb index 8f4984c6c..7d55ea32d 100644 --- a/source/plugins/ruby/in_kube_nodes_test.rb +++ b/source/plugins/ruby/in_kube_nodes_test.rb @@ -1,10 +1,10 @@ -require 'minitest/autorun' +require "minitest/autorun" -require 'fluent/test' -require 'fluent/test/driver/input' -require 'fluent/test/helpers' +require "fluent/test" +require "fluent/test/driver/input" +require "fluent/test/helpers" -require_relative 'in_kube_nodes.rb' +require_relative "in_kube_nodes.rb" class InKubeNodesTests < Minitest::Test include Fluent::Test::Helpers @@ -13,20 +13,22 @@ def setup Fluent::Test.setup end - def create_driver(conf = {}, kubernetesApiClient=nil, applicationInsightsUtility=nil, extensionUtils=nil, env=nil, telemetry_flush_interval=nil) - Fluent::Test::Driver::Input.new(Fluent::Plugin::Kube_nodeInventory_Input.new(kubernetesApiClient=kubernetesApiClient, - applicationInsightsUtility=applicationInsightsUtility, - extensionUtils=extensionUtils, - env=env)).configure(conf) + def create_driver(conf = {}, is_unit_test_mode = true, kubernetesApiClient = nil, applicationInsightsUtility = nil, extensionUtils = nil, env = nil, 
telemetry_flush_interval = nil, node_items_test_cache) + Fluent::Test::Driver::Input.new(Fluent::Plugin::Kube_nodeInventory_Input.new(is_unit_test_mode, kubernetesApiClient = kubernetesApiClient, + applicationInsightsUtility = applicationInsightsUtility, + extensionUtils = extensionUtils, + env = env, + telemetry_flush_interval, + node_items_test_cache)).configure(conf) end # Collection time of scrapped data will always be different. Overwrite it in any records returned by in_kube_ndes.rb def overwrite_collection_time(data) if data.key?("CollectionTime") - data["CollectionTime"] = "~CollectionTime~" + data["CollectionTime"] = "~CollectionTime~" end if data.key?("Timestamp") - data["Timestamp"] = "~Timestamp~" + data["Timestamp"] = "~Timestamp~" end return data end @@ -45,41 +47,46 @@ def test_basic_single_node # isAADMSIAuthMode() is called multiple times and we don't really care how many time it is called. This is the same as mocking # but it doesn't track how many times isAADMSIAuthMode is called def extensionUtils.isAADMSIAuthMode - false + false end nodes_api_response = eval(File.open("test/unit-tests/canned-api-responses/kube-nodes.txt").read) - kubeApiClient.expect(:getResourcesAndContinuationToken, [nil, nodes_api_response], ["nodes?limit=200"]) + node_items_test_cache = nodes_api_response["items"] + kubeApiClient.expect(:getClusterName, "/cluster-name") kubeApiClient.expect(:getClusterId, "/cluster-id") + def appInsightsUtil.sendExceptionTelemetry(exception) + if exception.to_s != "undefined method `[]' for nil:NilClass" + raise "an unexpected exception has occured" + end + end config = "run_interval 999999999" # only run once - d = create_driver(config, kubernetesApiClient=kubeApiClient, applicationInsightsUtility=appInsightsUtil, extensionUtils=extensionUtils, env=env) + d = create_driver(config, true, kubernetesApiClient = kubeApiClient, applicationInsightsUtility = appInsightsUtil, extensionUtils = extensionUtils, env = env, node_items_test_cache) 
d.instance.start d.instance.enumerate d.run(timeout: 99999) # Input plugins decide when to run, so we have to give it enough time to run - - expected_responses = { ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"aks-nodepool1-24816391-vmss000000", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"})] => true, - ["mdm.kubenodeinventory", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", 
"failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"aks-nodepool1-24816391-vmss000000", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"})] => true, - ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"})] => true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1900000000.0}]"})] => true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":4787511296.0}]"})] => 
true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000000.0}]"})] => true, - ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":7291510784.0}]"})] => true} + expected_responses = { ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", overwrite_collection_time({ "CollectionTime" => "2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "aks-nodepool1-24816391-vmss000000", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", 
"topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" })] => true, + ["mdm.kubenodeinventory", overwrite_collection_time({ "CollectionTime" => "2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "aks-nodepool1-24816391-vmss000000", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" })] => true, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", overwrite_collection_time({ "CollectionTime" => "2021-08-17T20:24:18Z", "Computer" => "aks-nodepool1-24816391-vmss000000", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" })] => true, + 
["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1900000000.0}]" })] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":4787511296.0}]" })] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000000.0}]" })] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({ "Timestamp" => "2021-08-17T20:24:18Z", "Host" => "aks-nodepool1-24816391-vmss000000", "Computer" => "aks-nodepool1-24816391-vmss000000", "ObjectName" => "K8SNode", "InstanceName" => "None/aks-nodepool1-24816391-vmss000000", "json_Collections" => "[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":7291510784.0}]" })] => true } d.events.each do |tag, time, record| - cleaned_record = overwrite_collection_time record - if expected_responses.key?([tag, cleaned_record]) - expected_responses[[tag, cleaned_record]] = true - else - assert(false, "got unexpected record") - end + cleaned_record = overwrite_collection_time record + if expected_responses.key?([tag, cleaned_record]) + 
expected_responses[[tag, cleaned_record]] = true + else + assert(false, "got unexpected record: #{cleaned_record}") + end end expected_responses.each do |key, val| - assert(val, "expected record not emitted: #{key}") + assert(val, "expected record not emitted: #{key}") end # make sure all mocked methods were called the expected number of times @@ -104,7 +111,7 @@ def test_malformed_node_spec # isAADMSIAuthMode() is called multiple times and we don't really care how many time it is called. This is the same as mocking # but it doesn't track how many times isAADMSIAuthMode is called def extensionUtils.isAADMSIAuthMode - false + false end # Set up the KubernetesApiClient Mock. Note: most of the functions in KubernetesApiClient are pure (access no @@ -112,16 +119,17 @@ def extensionUtils.isAADMSIAuthMode # more brittle). Instead, in_kube_nodes bypasses the mock and directly calls these functions in KubernetesApiClient. # Ideally the pure functions in KubernetesApiClient would be refactored into their own file to reduce confusion. nodes_api_response = eval(File.open("test/unit-tests/canned-api-responses/kube-nodes-malformed.txt").read) - kubeApiClient.expect(:getResourcesAndContinuationToken, [nil, nodes_api_response], ["nodes?limit=200"]) + node_items_test_cache = nodes_api_response["items"] + kubeApiClient.expect(:getClusterName, "/cluster-name") kubeApiClient.expect(:getClusterName, "/cluster-name") kubeApiClient.expect(:getClusterId, "/cluster-id") kubeApiClient.expect(:getClusterId, "/cluster-id") def appInsightsUtil.sendExceptionTelemetry(exception) - if exception.to_s != "undefined method `[]' for nil:NilClass" - raise "an unexpected exception has occured" - end + if exception.to_s != "undefined method `[]' for nil:NilClass" + raise "an unexpected exception has occured" + end end # This test doesn't care if metric telemetry is sent properly. 
Looking for an unnecessary value would make it needlessly rigid @@ -130,38 +138,38 @@ def appInsightsUtil.sendMetricTelemetry(a, b, c) config = "run_interval 999999999" # only run once - d = create_driver(config, kubernetesApiClient=kubeApiClient, applicationInsightsUtility=appInsightsUtil, extensionUtils=extensionUtils, env=env, telemetry_flush_interval=0) + d = create_driver(config, true, kubernetesApiClient = kubeApiClient, applicationInsightsUtility = appInsightsUtil, extensionUtils = extensionUtils, env = env, telemetry_flush_interval = 0, node_items_test_cache) d.instance.start d.instance.enumerate d.run(timeout: 99999) #TODO: is this necessary? expected_responses = { - ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"correct-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - 
["mdm.kubenodeinventory", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"correct-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1000000.0}]"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", 
"json_Collections"=>"[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":444.0}]"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000.0}]"}] => false, - ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":555.0}]"}] => false, - - # these records are for the malformed node (it doesn't have limits or requests set so there are no PERF records) - ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"malformed-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", 
"LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - ["mdm.kubenodeinventory", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"malformed-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, - ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"}] => false + ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => 
"Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "correct-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["mdm.kubenodeinventory", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "correct-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", 
"node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "correct-node", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1000000.0}]" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":444.0}]" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000.0}]" }] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", { "Timestamp" => "~Timestamp~", "Host" => "correct-node", "Computer" => "correct-node", "ObjectName" => "K8SNode", "InstanceName" => "None/correct-node", "json_Collections" => "[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":555.0}]" }] => false, + + # these records are for the malformed node (it doesn't have limits or requests set so there are no PERF records) + 
["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", "kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "malformed-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["mdm.kubenodeinventory", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "ClusterName" => "/cluster-name", "ClusterId" => "/cluster-id", "CreationTimeStamp" => "2021-07-21T23:40:14Z", "Labels" => [{ "agentpool" => "nodepool1", "beta.kubernetes.io/arch" => "amd64", "beta.kubernetes.io/instance-type" => "Standard_DS2_v2", "beta.kubernetes.io/os" => "linux", "failure-domain.beta.kubernetes.io/region" => "westus2", "failure-domain.beta.kubernetes.io/zone" => "0", "kubernetes.azure.com/cluster" => "MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode" => "system", 
"kubernetes.azure.com/node-image-version" => "AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku" => "Ubuntu", "kubernetes.azure.com/role" => "agent", "kubernetes.io/arch" => "amd64", "kubernetes.io/hostname" => "malformed-node", "kubernetes.io/os" => "linux", "kubernetes.io/role" => "agent", "node-role.kubernetes.io/agent" => "", "node.kubernetes.io/instance-type" => "Standard_DS2_v2", "storageprofile" => "managed", "storagetier" => "Premium_LRS", "topology.kubernetes.io/region" => "westus2", "topology.kubernetes.io/zone" => "0" }], "Status" => "Ready", "KubernetesProviderID" => "azure", "LastTransitionTimeReady" => "2021-07-21T23:40:24Z", "KubeletVersion" => "v1.19.11", "KubeProxyVersion" => "v1.19.11" }] => false, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", { "CollectionTime" => "~CollectionTime~", "Computer" => "malformed-node", "OperatingSystem" => "Ubuntu 18.04.5 LTS", "DockerVersion" => "containerd://1.4.4+azure" }] => false, } d.events.each do |tag, time, record| - cleaned_record = overwrite_collection_time record - if expected_responses.key?([tag, cleaned_record]) - expected_responses[[tag, cleaned_record]] = true - end - # don't do anything if an unexpected record was emitted. Since the node spec is malformed, there will be some partial data. - # we care more that the non-malformed data is still emitted + cleaned_record = overwrite_collection_time record + if expected_responses.key?([tag, cleaned_record]) + expected_responses[[tag, cleaned_record]] = true + end + # don't do anything if an unexpected record was emitted. Since the node spec is malformed, there will be some partial data. 
+ # we care more that the non-malformed data is still emitted end expected_responses.each do |key, val| - assert(val, "expected record not emitted: #{key}") + assert(val, "expected record not emitted: #{key}") end kubeApiClient.verify From 81eec6ea39d2306a6fb26fedc53f21112bb3ef9b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 11 Jun 2022 09:56:25 -0700 Subject: [PATCH 63/65] exclude unfixed cve until this get fixed --- .trivyignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.trivyignore b/.trivyignore index f8c029116..56ac504d5 100644 --- a/.trivyignore +++ b/.trivyignore @@ -16,4 +16,4 @@ CVE-2021-31799 CVE-2021-28965 #dpkg vulnerability in ubuntu -CVE-2022-1664 \ No newline at end of file +CVE-2022-1304 \ No newline at end of file From 1a3fa0ea200332f8cb6f91c6fb68ea4afd3ef66d Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 13 Jun 2022 12:24:50 -0700 Subject: [PATCH 64/65] fix minor issue --- source/plugins/ruby/in_kube_podinventory.rb | 2 +- source/plugins/ruby/in_kube_podmdminventory.rb | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index b84b53d28..bdbc465ec 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1164,7 +1164,7 @@ def watch_windows_nodes end def writeMDMRecords(mdmRecordsJson) - maxRetryCount = 3 + maxRetryCount = 5 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 begin diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index 5be9bc99c..bfc5227f3 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -76,8 +76,7 @@ def parse_and_emit_records() begin $log.info "in_kube_podmdminventory:parse_and_emit_records:Start:getMDMRecords @ #{Time.now.utc.iso8601}" mdmPodRecords = getMDMRecords() - mdmPodRecordItems = - 
$log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" + $log.info "in_kube_podmdminventory:parse_and_emit_records:End:getMDMRecords @ #{Time.now.utc.iso8601}" if !mdmPodRecords.nil? && !mdmPodRecords.empty? && mdmPodRecords["items"].length > 0 batchTime = mdmPodRecords["collectionTime"] # This is same batchTime used in KubePODinventory mdmPodRecords["items"].each do |record| From 7f3372a96dbbd855e1d5db10273d2c3b27d47d72 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 13 Jun 2022 23:21:23 -0700 Subject: [PATCH 65/65] increase retries to handle transient errors --- source/plugins/ruby/in_kube_nodes.rb | 2 +- source/plugins/ruby/in_kube_perfinventory.rb | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 690a1ca8c..a3cbb5a85 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -752,7 +752,7 @@ def watch_nodes end def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) - maxRetryCount = 3 + maxRetryCount = 5 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 begin diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index 50552a25d..ad8fdbf21 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -103,7 +103,6 @@ def enumerate(podList = nil) end nodeAllocatableRecords = getNodeAllocatableRecords() - $log.info("in_kube_perfinventory::enumerate : number of nodeAllocatableRecords :#{nodeAllocatableRecords.length} from Kube API @ #{Time.now.utc.iso8601}") # Initializing continuation token to nil continuationToken = nil podItemsCacheSizeKB = 0 @@ -398,7 +397,7 @@ def watch_pods end def getNodeAllocatableRecords() - maxRetryCount = 3 + maxRetryCount = 5 initialRetryDelaySecs = 0.5 retryAttemptCount = 1 nodeAllocatableRecords = {} @@ -418,8 +417,8 @@ def 
getNodeAllocatableRecords() if retryAttemptCount < maxRetryCount f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? - retryAttemptCount = retryAttemptCount + 1 sleep (initialRetryDelaySecs * retryAttemptCount) + retryAttemptCount = retryAttemptCount + 1 retry end $log.warn "in_kube_perfinventory:getNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}"