From 82d91c2da509c4f702e11626997390c86c4b0cc6 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 29 Nov 2020 15:03:31 -0800 Subject: [PATCH 01/45] optimize kpi --- kubernetes/omsagent.yaml | 17 +- source/plugins/ruby/KubernetesApiClient.rb | 387 ++++++----- source/plugins/ruby/in_kube_podinventory.rb | 714 +++++++++++--------- 3 files changed, 611 insertions(+), 507 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 2155361e9..85c383ec2 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -134,7 +134,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer @@ -149,7 +149,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer @@ -179,7 +179,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer @@ -208,7 +208,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer @@ -564,6 +564,15 @@ spec: periodSeconds: 60 affinity: nodeAffinity: + # affinity to schedule on to ephemeral os node if its available + # preferredDuringSchedulingIgnoredDuringExecution: + # - weight: 1 + # preference: + # matchExpressions: + # - key: storageprofile + # operator: NotIn + # values: + # - managed requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - labelSelector: diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 073eb0417..13c084a5c 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -172,6 +172,10 @@ def isAROV3Cluster return @@IsAROV3Cluster end + def 
isAROv3MasterOrInfraPod(nodeName) + return isAROV3Cluster() && (!nodeName.nil? && (nodeName.downcase.start_with?("infra-") || nodeName.downcase.start_with?("master-"))) + end + def isNodeMaster return @@IsNodeMaster if !@@IsNodeMaster.nil? @@IsNodeMaster = false @@ -276,7 +280,8 @@ def getPods(namespace) def getWindowsNodes winNodes = [] begin - resourceUri = getNodesResourceUri("nodes") + # get only windows nodes + resourceUri = getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows") nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body) @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" # Resetting the windows node cache @@ -396,42 +401,67 @@ def getPodUid(podNameSpace, podMetadata) return podUid end - def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId - metricInfo = metricJSON - metricInfo["items"].each do |pod| - podNameSpace = pod["metadata"]["namespace"] - podUid = getPodUid(podNameSpace, pod["metadata"]) - if podUid.nil? - next - end - - # For ARO, skip the pods scheduled on to master or infra nodes to ingest - if isAROV3Cluster() && !pod["spec"].nil? && !pod["spec"]["nodeName"].nil? && - (pod["spec"]["nodeName"].downcase.start_with?("infra-") || - pod["spec"]["nodeName"].downcase.start_with?("master-")) - next - end + podNameSpace = pod["metadata"]["namespace"] + podUid = getPodUid(podNameSpace, pod["metadata"]) + if podUid.nil? + return metricItems + end - podContainers = [] - if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? - podContainers = podContainers + pod["spec"]["containers"] - end - # Adding init containers to the record list as well. - if !pod["spec"]["initContainers"].nil? 
&& !pod["spec"]["initContainers"].empty? - podContainers = podContainers + pod["spec"]["initContainers"] - end + nodeName = "" + #for unscheduled (non-started) pods nodeName does NOT exist + if !pod["spec"]["nodeName"].nil? + nodeName = pod["spec"]["nodeName"] + end + # For ARO, skip the pods scheduled on to master or infra nodes to ingest + if isAROv3MasterOrInfraPod(nodeName) + return metricItems + end - if (!podContainers.nil? && !podContainers.empty? && !pod["spec"]["nodeName"].nil?) - nodeName = pod["spec"]["nodeName"] - podContainers.each do |container| - containerName = container["name"] - #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) - metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) + podContainers = [] + if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? + podContainers = podContainers + pod["spec"]["containers"] + end + # Adding init containers to the record list as well. + if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? + podContainers = podContainers + pod["spec"]["initContainers"] + end + if (!podContainers.nil? && !podContainers.empty? && !pod["spec"]["nodeName"].nil?) + podContainers.each do |container| + containerName = container["name"] + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) 
+ metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) + + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = nodeName + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = nodeName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #No container level limit for the given metric, so default to node level limit + else + nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect + if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) + metricValue = @@NodeMetrics[nodeMetricsHashKey] + #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") metricItem = {} metricItem["DataItems"] = [] @@ -451,32 +481,6 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) - #No container level limit for the given metric, so default to node level limit - else - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) - metricValue = @@NodeMetrics[nodeMetricsHashKey] - #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node 
level limits: #{nodeMetricsHashKey}=#{metricValue} ") - metricItem = {} - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = nodeName - # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent - metricProps["Computer"] = nodeName - metricProps["ObjectName"] = "K8SContainer" - metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) - end end end end @@ -488,78 +492,74 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName return metricItems end #getContainerResourceRequestAndLimits - def getContainerResourceRequestsAndLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId clusterName = getClusterName - - metricInfo = metricJSON - metricInfo["items"].each do |pod| - podNameSpace = pod["metadata"]["namespace"] - if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences") - # The above case seems to be the only case where you have horizontal scaling of pods - # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash - # instead of the actual poduid. Since this uid is not being surface into the UX - # its ok to use this. - # Use kubernetes.io/config.hash to be able to correlate with cadvisor data - if pod["metadata"]["annotations"].nil? 
- next - else - podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] - end + podNameSpace = pod["metadata"]["namespace"] + if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences") + # The above case seems to be the only case where you have horizontal scaling of pods + # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash + # instead of the actual poduid. Since this uid is not being surface into the UX + # its ok to use this. + # Use kubernetes.io/config.hash to be able to correlate with cadvisor data + if pod["metadata"]["annotations"].nil? + return metricItems else - podUid = pod["metadata"]["uid"] + podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] end + else + podUid = pod["metadata"]["uid"] + end - podContainers = [] - if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? - podContainers = podContainers + pod["spec"]["containers"] - end - # Adding init containers to the record list as well. - if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? - podContainers = podContainers + pod["spec"]["initContainers"] - end + podContainers = [] + if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? + podContainers = podContainers + pod["spec"]["containers"] + end + # Adding init containers to the record list as well. + if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? + podContainers = podContainers + pod["spec"]["initContainers"] + end - if (!podContainers.nil? && !podContainers.empty?) - if (!pod["spec"]["nodeName"].nil?) - nodeName = pod["spec"]["nodeName"] + if (!podContainers.nil? && !podContainers.empty?) + if (!pod["spec"]["nodeName"].nil?) + nodeName = pod["spec"]["nodeName"] + else + nodeName = "" #unscheduled pod. 
We still want to collect limits & requests for GPU + end + podContainers.each do |container| + metricValue = nil + containerName = container["name"] + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) + metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) else - nodeName = "" #unscheduled pod. We still want to collect limits & requests for GPU - end - podContainers.each do |container| - metricValue = nil - containerName = container["name"] - #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) - metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) - else - #No container level limit for the given metric, so default to node level limit for non-gpu metrics - if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - metricValue = @@NodeMetrics[nodeMetricsHashKey] - end - end - if (!metricValue.nil?) 
- metricItem = {} - metricItem["CollectionTime"] = metricTime - metricItem["Computer"] = nodeName - metricItem["Name"] = metricNametoReturn - metricItem["Value"] = metricValue - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName - #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace - - metricItem["Tags"] = metricTags - - metricItems.push(metricItem) + #No container level limit for the given metric, so default to node level limit for non-gpu metrics + if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") + nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect + metricValue = @@NodeMetrics[nodeMetricsHashKey] end end + if (!metricValue.nil?) 
+ metricItem = {} + metricItem["CollectionTime"] = metricTime + metricItem["Computer"] = nodeName + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName + #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + end end end rescue => error @@ -569,92 +569,82 @@ def getContainerResourceRequestsAndLimitsAsInsightsMetrics(metricJSON, metricCat return metricItems end #getContainerResourceRequestAndLimitsAsInsightsMetrics - def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) - metricItems = [] + def parseNodeLimits(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + metricItem = {} begin - metricInfo = metricJSON clusterId = getClusterId #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, #if we are coming up with the time it should be same for all nodes #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - metricInfo["items"].each do |node| - if (!node["status"][metricCategory].nil?) 
- - # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" - metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem["DataItems"] = [] - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = node["metadata"]["name"] - # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent - metricProps["Computer"] = node["metadata"]["name"] - metricProps["ObjectName"] = "K8SNode" - metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) - #push node level metrics to a inmem hash so that we can use it looking up at container level. - #Currently if container level cpu & memory limits are not defined we default to node level limits - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") - end + if (!node["status"][metricCategory].nil?) 
+ # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem["DataItems"] = [] + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = node["metadata"]["name"] + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = node["metadata"]["name"] + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + + #push node level metrics to a inmem hash so that we can use it looking up at container level. + #Currently if container level cpu & memory limits are not defined we default to node level limits + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") end rescue => error @Log.warn("parseNodeLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") end - return metricItems + return metricItem end #parseNodeLimits - def parseNodeLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) - metricItems = [] + def parseNodeLimitsAsInsightsMetrics(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + metricItem = {} begin - metricInfo = metricJSON - clusterId = getClusterId - clusterName = getClusterName #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, 
#if we are coming up with the time it should be same for all nodes #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - metricInfo["items"].each do |node| - if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?) - - # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" or "amd.com/gpu" or "nvidia.com/gpu" - metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem["CollectionTime"] = metricTime - metricItem["Computer"] = node["metadata"]["name"] - metricItem["Name"] = metricNametoReturn - metricItem["Value"] = metricValue - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect - - metricItem["Tags"] = metricTags - - metricItems.push(metricItem) - #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level. - #Currently if container level cpu & memory limits are not defined we default to node level limits - if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") - end + if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?) 
+ clusterId = getClusterId + clusterName = getClusterName + + # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" or "amd.com/gpu" or "nvidia.com/gpu" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem["CollectionTime"] = metricTime + metricItem["Computer"] = node["metadata"]["name"] + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect + + metricItem["Tags"] = metricTags + + #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level. + #Currently if container level cpu & memory limits are not defined we default to node level limits + if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") end end rescue => error @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") end - return metricItems + return metricItem end def getMetricNumericValue(metricName, metricVal) @@ -777,5 +767,34 @@ def getKubeAPIServerUrl end return apiServerUrl end + + def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) + kubeServiceRecords = [] + begin + if (!serviceList.nil? && !serviceList.empty?) 
+ servicesCount = serviceList["items"].length + @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}") + servicesSizeInKB = (serviceList["items"].to_s.length) / 1024 + @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : size of serviceList in KB #{servicesSizeInKB} @ #{Time.now.utc.iso8601}") + serviceList["items"].each do |item| + kubeServiceRecord = {} + kubeServiceRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + kubeServiceRecord["ServiceName"] = item["metadata"]["name"] + kubeServiceRecord["Namespace"] = item["metadata"]["namespace"] + kubeServiceRecord["SelectorLabels"] = [item["spec"]["selector"]] + # add these before emit to avoid memory foot print + # kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId + # kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName + kubeServiceRecord["ClusterIP"] = item["spec"]["clusterIP"] + kubeServiceRecord["ServiceType"] = item["spec"]["type"] + kubeServiceRecords.push(kubeServiceRecord.dup) + end + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getKubeServicesInventoryRecords:Failed with an error : #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + return kubeServiceRecords + end end end diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index bba3e920f..0eead7782 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -2,7 +2,7 @@ # frozen_string_literal: true module Fluent - require_relative "podinventory_to_mdm" + require_relative "podinventory_to_mdm" class Kube_PodInventory_Input < Input Plugin.register_input("kubepodinventory", self) @@ -19,7 +19,7 @@ def initialize require "yajl" require "set" require "time" - + require_relative "kubernetes_container_inventory" require_relative 
"KubernetesApiClient" require_relative "ApplicationInsightsUtility" @@ -27,11 +27,18 @@ def initialize require_relative "omslog" require_relative "constants" + @PODS_EMIT_STREAM = true + @CONTAINER_PERF_EMIT_STREAM = true + @GPU_PERF_EMIT_STREAM = true + @SERVICES_EMIT_STREAM = true + @PODS_CHUNK_SIZE = "1500" @podCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 @controllerData = {} + # 0 indicates no batch enabled for stream emit + @PODS_EMIT_STREAM_BATCH_SIZE = 0 end config_param :run_interval, :time, :default => 60 @@ -44,6 +51,36 @@ def configure(conf) def start if @run_interval + if !ENV["PODS_EMIT_STREAM"].nil? && !ENV["PODS_EMIT_STREAM"].empty? + @PODS_EMIT_STREAM = ENV["PODS_EMIT_STREAM"].to_s.downcase == "true" ? true : false + end + $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM @ #{@PODS_EMIT_STREAM}") + + if !ENV["SERVICES_EMIT_STREAM"].nil? && !ENV["SERVICES_EMIT_STREAM"].empty? + @SERVICES_EMIT_STREAM = ENV["SERVICES_EMIT_STREAM"].to_s.downcase == "true" ? true : false + end + $log.info("in_kube_podinventory::start : SERVICES_EMIT_STREAM @ #{@SERVICES_EMIT_STREAM}") + + if !ENV["CONTAINER_PERF_EMIT_STREAM"].nil? && !ENV["CONTAINER_PERF_EMIT_STREAM"].empty? + @CONTAINER_PERF_EMIT_STREAM = ENV["CONTAINER_PERF_EMIT_STREAM"].to_s.downcase == "true" ? true : false + end + $log.info("in_kube_podinventory::start : CONTAINER_PERF_EMIT_STREAM @ #{@CONTAINER_PERF_EMIT_STREAM}") + + if !ENV["GPU_PERF_EMIT_STREAM"].nil? && !ENV["GPU_PERF_EMIT_STREAM"].empty? + @GPU_PERF_EMIT_STREAM = ENV["GPU_PERF_EMIT_STREAM"].to_s.downcase == "true" ? true : false + end + $log.info("in_kube_podinventory::start : GPU_PERF_EMIT_STREAM @ #{@GPU_PERF_EMIT_STREAM}") + + if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? + @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"] + end + $log.info("in_kube_podinventory::start : PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}") + + if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? 
+ @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i + end + $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -72,6 +109,7 @@ def enumerate(podList = nil) @controllerData = {} currentTime = Time.now batchTime = currentTime.utc.iso8601 + serviceRecords = [] # Get services first so that we dont need to make a call for very chunk $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") @@ -84,6 +122,9 @@ def enumerate(podList = nil) serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") serviceInfo = nil + # service inventory records much smaller size and fixed compared to serviceList + serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime) + serviceList = nil end # Initializing continuation token to nil @@ -92,7 +133,7 @@ def enumerate(podList = nil) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -101,7 +142,7 @@ def enumerate(podList = nil) while (!continuationToken.nil? && !continuationToken.empty?) 
continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -109,7 +150,7 @@ def enumerate(podList = nil) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil - serviceList = nil + serviceRecords = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -137,258 +178,151 @@ def enumerate(podList = nil) $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - end + end - def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f #batchTime = currentTime.utc.iso8601 eventStream = MultiEventStream.new + kubePerfEventStream = MultiEventStream.new + insightsMetricsEventStream = MultiEventStream.new @@istestvar = ENV["ISTEST"] begin #begin block start # Getting windows nodes from kubeapi winNodes = KubernetesApiClient.getWindowsNodesArray - - podInventory["items"].each do |items| #podInventory block start - containerInventoryRecords = [] - records = [] - record = {} - record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - record["Name"] = items["metadata"]["name"] - podNameSpace = items["metadata"]["namespace"] - - # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes - 
if KubernetesApiClient.isAROV3Cluster && !items["spec"].nil? && !items["spec"]["nodeName"].nil? && - (items["spec"]["nodeName"].downcase.start_with?("infra-") || - items["spec"]["nodeName"].downcase.start_with?("master-")) - next - end - - podUid = KubernetesApiClient.getPodUid(podNameSpace, items["metadata"]) - if podUid.nil? - next - end - record["PodUid"] = podUid - record["PodLabel"] = [items["metadata"]["labels"]] - record["Namespace"] = podNameSpace - record["PodCreationTimeStamp"] = items["metadata"]["creationTimestamp"] - #for unscheduled (non-started) pods startTime does NOT exist - if !items["status"]["startTime"].nil? - record["PodStartTime"] = items["status"]["startTime"] - else - record["PodStartTime"] = "" - end - #podStatus - # the below is for accounting 'NodeLost' scenario, where-in the pod(s) in the lost node is still being reported as running - podReadyCondition = true - if !items["status"]["reason"].nil? && items["status"]["reason"] == "NodeLost" && !items["status"]["conditions"].nil? - items["status"]["conditions"].each do |condition| - if condition["type"] == "Ready" && condition["status"] == "False" - podReadyCondition = false - break - end + podInventory["items"].each do |item| #podInventory block start + # pod inventory records + podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) + podInventoryRecords.each do |record| + if !record.nil? + wrapper = { + "DataType" => "KUBE_POD_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + @inventoryToMdmConvertor.process_pod_inventory_record(wrapper) end end - - if podReadyCondition == false - record["PodStatus"] = "Unknown" - # ICM - https://portal.microsofticm.com/imp/v3/incidents/details/187091803/home - elsif !items["metadata"]["deletionTimestamp"].nil? && !items["metadata"]["deletionTimestamp"].empty? 
- record["PodStatus"] = Constants::POD_STATUS_TERMINATING - else - record["PodStatus"] = items["status"]["phase"] - end - #for unscheduled (non-started) pods podIP does NOT exist - if !items["status"]["podIP"].nil? - record["PodIp"] = items["status"]["podIP"] - else - record["PodIp"] = "" - end - #for unscheduled (non-started) pods nodeName does NOT exist - if !items["spec"]["nodeName"].nil? - record["Computer"] = items["spec"]["nodeName"] - else - record["Computer"] = "" - end - # Setting this flag to true so that we can send ContainerInventory records for containers # on windows nodes and parse environment variables for these containers if winNodes.length > 0 - if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) + nodeName = "" + if !item["spec"]["nodeName"].nil? + nodeName = item["spec"]["nodeName"] + end + if (!nodeName.empty? && (winNodes.include? nodeName)) clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel - containerInventoryRecordsInPodItem = KubernetesContainerInventory.getContainerInventoryRecords(items, batchTime, clusterCollectEnvironmentVar, true) - containerInventoryRecordsInPodItem.each do |containerRecord| - containerInventoryRecords.push(containerRecord) - end + containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true) + # Send container inventory records for containers on windows nodes + @winContainerCount += containerInventoryRecords.length + containerInventoryRecords.each do |cirecord| + if !cirecord.nil? 
+ ciwrapper = { + "DataType" => "CONTAINER_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }], + } + eventStream.add(emitTime, ciwrapper) if ciwrapper + end + end end end - record["ClusterId"] = KubernetesApiClient.getClusterId - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ServiceName"] = getServiceNameFromLabels(items["metadata"]["namespace"], items["metadata"]["labels"], serviceList) - - if !items["metadata"]["ownerReferences"].nil? - record["ControllerKind"] = items["metadata"]["ownerReferences"][0]["kind"] - record["ControllerName"] = items["metadata"]["ownerReferences"][0]["name"] - @controllerSet.add(record["ControllerKind"] + record["ControllerName"]) - #Adding controller kind to telemetry ro information about customer workload - if (@controllerData[record["ControllerKind"]].nil?) - @controllerData[record["ControllerKind"]] = 1 - else - controllerValue = @controllerData[record["ControllerKind"]] - @controllerData[record["ControllerKind"]] += 1 + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + if @PODS_EMIT_STREAM + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + router.emit_stream(@tag, eventStream) if eventStream end + eventStream = MultiEventStream.new end - podRestartCount = 0 - record["PodRestartCount"] = 0 - #Invoke the helper method to compute ready/not ready mdm metric - @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], items["status"]["conditions"]) + #container perf records + containerMetricDataItems = [] + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime)) - podContainers = [] - if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? - podContainers = podContainers + items["status"]["containerStatuses"] - end - # Adding init containers to the record list as well. - if items["status"].key?("initContainerStatuses") && !items["status"]["initContainerStatuses"].empty? - podContainers = podContainers + items["status"]["initContainerStatuses"] + containerMetricDataItems.each do |record| + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + kubePerfEventStream.add(emitTime, record) if record end - # if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? #container status block start - if !podContainers.empty? 
#container status block start - podContainers.each do |container| - containerRestartCount = 0 - lastFinishedTime = nil - # Need this flag to determine if we need to process container data for mdm metrics like oomkilled and container restart - #container Id is of the form - #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 - if !container["containerID"].nil? - record["ContainerID"] = container["containerID"].split("//")[1] - else - # for containers that have image issues (like invalid image/tag etc..) this will be empty. do not make it all 0 - record["ContainerID"] = "" - end - #keeping this as which is same as InstanceName in perf table - if podUid.nil? || container["name"].nil? - next - else - record["ContainerName"] = podUid + "/" + container["name"] - end - #Pod restart count is a sumtotal of restart counts of individual containers - #within the pod. The restart count of a container is maintained by kubernetes - #itself in the form of a container label. - containerRestartCount = container["restartCount"] - record["ContainerRestartCount"] = containerRestartCount - - containerStatus = container["state"] - record["ContainerStatusReason"] = "" - # state is of the following form , so just picking up the first key name - # "state": { - # "waiting": { - # "reason": "CrashLoopBackOff", - # "message": "Back-off 5m0s restarting failed container=metrics-server pod=metrics-server-2011498749-3g453_kube-system(5953be5f-fcae-11e7-a356-000d3ae0e432)" - # } - # }, - # the below is for accounting 'NodeLost' scenario, where-in the containers in the lost node/pod(s) is still being reported as running - if podReadyCondition == false - record["ContainerStatus"] = "Unknown" - else - record["ContainerStatus"] = containerStatus.keys[0] - end - #TODO : Remove ContainerCreationTimeStamp from here since we are sending it as a metric - #Picking up both container and node start time from cAdvisor to be consistent - if containerStatus.keys[0] == "running" - 
record["ContainerCreationTimeStamp"] = container["state"]["running"]["startedAt"] - else - if !containerStatus[containerStatus.keys[0]]["reason"].nil? && !containerStatus[containerStatus.keys[0]]["reason"].empty? - record["ContainerStatusReason"] = containerStatus[containerStatus.keys[0]]["reason"] - end - # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm - if !record["ControllerKind"].nil? && record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB - @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus) - end - end - - # Record the last state of the container. This may have information on why a container was killed. - begin - if !container["lastState"].nil? && container["lastState"].keys.length == 1 - lastStateName = container["lastState"].keys[0] - lastStateObject = container["lastState"][lastStateName] - if !lastStateObject.is_a?(Hash) - raise "expected a hash object. 
This could signify a bug or a kubernetes API change" - end + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + if @CONTAINER_PERF_EMIT_STREAM + $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + end + kubePerfEventStream = MultiEventStream.new + end - if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") - newRecord = Hash.new - newRecord["lastState"] = lastStateName # get the name of the last state (ex: terminated) - lastStateReason = lastStateObject["reason"] - # newRecord["reason"] = lastStateObject["reason"] # (ex: OOMKilled) - newRecord["reason"] = lastStateReason # (ex: OOMKilled) - newRecord["startedAt"] = lastStateObject["startedAt"] # (ex: 2019-07-02T14:58:51Z) - lastFinishedTime = lastStateObject["finishedAt"] - newRecord["finishedAt"] = lastFinishedTime # (ex: 2019-07-02T14:58:52Z) - - # only write to the output field if everything previously ran without error - record["ContainerLastStatus"] = newRecord - - #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled - if lastStateReason.downcase == Constants::REASON_OOM_KILLED - @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) - end - lastStateReason = nil - else - record["ContainerLastStatus"] = Hash.new - end - else - record["ContainerLastStatus"] = Hash.new - end + # container GPU records + containerGPUInsightsMetricsDataItems = [] + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime)) + 
containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + end - #Populate mdm metric for container restart count if greater than 0 - if (!containerRestartCount.nil? && (containerRestartCount.is_a? Integer) && containerRestartCount > 0) - @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) - end - rescue => errorStr - $log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - record["ContainerLastStatus"] = Hash.new + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + if @GPU_PERF_EMIT_STREAM + $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - - podRestartCount += containerRestartCount - records.push(record.dup) - end - else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod - records.push(record) - end #container status block end - records.each do |record| - if !record.nil? - record["PodRestartCount"] = podRestartCount - wrapper = { - "DataType" => "KUBE_POD_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper - @inventoryToMdmConvertor.process_pod_inventory_record(wrapper) - end - end - # Send container inventory records for containers on windows nodes - @winContainerCount += containerInventoryRecords.length - containerInventoryRecords.each do |cirecord| - if !cirecord.nil? - ciwrapper = { - "DataType" => "CONTAINER_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }], - } - eventStream.add(emitTime, ciwrapper) if ciwrapper + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream end + insightsMetricsEventStream = MultiEventStream.new end end #podInventory block end - router.emit_stream(@tag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + if eventStream.count > 0 + $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + if insightsMetricsEventStream.count > 0 + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end + + if eventStream.count > 0 + if @PODS_EMIT_STREAM + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream + end + eventStream = nil + end + + if kubePerfEventStream.count > 0 + if @CONTAINER_PERF_EMIT_STREAM + $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + end + kubePerfEventStream = nil + end + + if insightsMetricsEventStream.count > 0 + if @GPU_PERF_EMIT_STREAM + $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + end + insightsMetricsEventStream = nil + end if continuationToken.nil? #no more chunks in this batch to be sent, get all pod inventory records to send @log.info "Sending pod inventory mdm records to out_mdm" @@ -401,101 +335,39 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end - #:optimize:kubeperf merge - begin - #if(!podInventory.empty?) 
- containerMetricDataItems = [] - #hostName = (OMS::Common.get_hostname) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "requests", "cpu", "cpuRequestNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "requests", "memory", "memoryRequestBytes", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "cpu", "cpuLimitNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "memory", "memoryLimitBytes", batchTime)) - - kubePerfEventStream = MultiEventStream.new - insightsMetricsEventStream = MultiEventStream.new - - containerMetricDataItems.each do |record| - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" - kubePerfEventStream.add(emitTime, record) if record - end - #end - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - - begin - #start GPU InsightsMetrics items - - containerGPUInsightsMetricsDataItems = [] - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) - - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) - - 
containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(emitTime, wrapper) if wrapper - - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - end - - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - #end GPU InsightsMetrics items - rescue => errorStr - $log.warn "Failed when processing GPU metrics in_kube_podinventory : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - rescue => errorStr - $log.warn "Failed in parse_and_emit_record for KubePerf from in_kube_podinventory : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - #:optimize:end kubeperf merge - - #:optimize:start kubeservices merge - begin - if (!serviceList.nil? && !serviceList.empty?) - kubeServicesEventStream = MultiEventStream.new - serviceList["items"].each do |items| - kubeServiceRecord = {} - kubeServiceRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - kubeServiceRecord["ServiceName"] = items["metadata"]["name"] - kubeServiceRecord["Namespace"] = items["metadata"]["namespace"] - kubeServiceRecord["SelectorLabels"] = [items["spec"]["selector"]] + if continuationToken.nil? # sending kube services inventory records + kubeServicesEventStream = MultiEventStream.new + serviceRecords.each do |kubeServiceRecord| + if !kubeServiceRecord.nil? 
+ # adding before emit to reduce memory foot print kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName - kubeServiceRecord["ClusterIP"] = items["spec"]["clusterIP"] - kubeServiceRecord["ServiceType"] = items["spec"]["type"] - # : Add ports and status fields kubeServicewrapper = { "DataType" => "KUBE_SERVICES_BLOB", "IPName" => "ContainerInsights", "DataItems" => [kubeServiceRecord.each { |k, v| kubeServiceRecord[k] = v }], } kubeServicesEventStream.add(emitTime, kubeServicewrapper) if kubeServicewrapper + + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + if @SERVICES_EMIT_STREAM + $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream + end + kubeServicesEventStream = MultiEventStream.new + end end + end + + if @SERVICES_EMIT_STREAM && kubeServicesEventStream.count > 0 + $log.info("in_kube_podinventory::parse_and_emit_records : number of service records emitted #{kubeServicesEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream end - rescue => errorStr - $log.warn "Failed in parse_and_emit_record for KubeServices from in_kube_podinventory : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + kubeServicesEventStream = nil end - #:optimize:end kubeservices merge #Updating value for AppInsights telemetry @podCount += podInventory["items"].length - - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) - $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end rescue => errorStr $log.warn "Failed in parse_and_emit_record pod inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -535,26 +407,230 @@ def run_periodic @mutex.unlock end - def getServiceNameFromLabels(namespace, labels, serviceList) + def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) + records = [] + record = {} + + begin + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Name"] = item["metadata"]["name"] + podNameSpace = item["metadata"]["namespace"] + nodeName = "" + #for unscheduled (non-started) pods nodeName does NOT exist + if !item["spec"]["nodeName"].nil? + nodeName = item["spec"]["nodeName"] + end + podUid = KubernetesApiClient.getPodUid(podNameSpace, item["metadata"]) + if podUid.nil? + return records + end + if KubernetesApiClient.isAROv3MasterOrInfraPod(nodeName) + return records + end + + record["PodUid"] = podUid + record["PodLabel"] = [item["metadata"]["labels"]] + record["Namespace"] = podNameSpace + record["PodCreationTimeStamp"] = item["metadata"]["creationTimestamp"] + #for unscheduled (non-started) pods startTime does NOT exist + if !item["status"]["startTime"].nil? + record["PodStartTime"] = item["status"]["startTime"] + else + record["PodStartTime"] = "" + end + #podStatus + # the below is for accounting 'NodeLost' scenario, where-in the pod(s) in the lost node is still being reported as running + podReadyCondition = true + if !item["status"]["reason"].nil? && item["status"]["reason"] == "NodeLost" && !item["status"]["conditions"].nil? 
+ item["status"]["conditions"].each do |condition| + if condition["type"] == "Ready" && condition["status"] == "False" + podReadyCondition = false + break + end + end + end + if podReadyCondition == false + record["PodStatus"] = "Unknown" + # ICM - https://portal.microsofticm.com/imp/v3/incidents/details/187091803/home + elsif !item["metadata"]["deletionTimestamp"].nil? && !item["metadata"]["deletionTimestamp"].empty? + record["PodStatus"] = Constants::POD_STATUS_TERMINATING + else + record["PodStatus"] = item["status"]["phase"] + end + #for unscheduled (non-started) pods podIP does NOT exist + if !item["status"]["podIP"].nil? + record["PodIp"] = item["status"]["podIP"] + else + record["PodIp"] = "" + end + + record["Computer"] = nodeName + record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ServiceName"] = getServiceNameFromLabels(item["metadata"]["namespace"], item["metadata"]["labels"], serviceRecords) + + if !item["metadata"]["ownerReferences"].nil? + record["ControllerKind"] = item["metadata"]["ownerReferences"][0]["kind"] + record["ControllerName"] = item["metadata"]["ownerReferences"][0]["name"] + @controllerSet.add(record["ControllerKind"] + record["ControllerName"]) + #Adding controller kind to telemetry ro information about customer workload + if (@controllerData[record["ControllerKind"]].nil?) + @controllerData[record["ControllerKind"]] = 1 + else + controllerValue = @controllerData[record["ControllerKind"]] + @controllerData[record["ControllerKind"]] += 1 + end + end + podRestartCount = 0 + record["PodRestartCount"] = 0 + + #Invoke the helper method to compute ready/not ready mdm metric + @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) + + podContainers = [] + if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty? 
+ podContainers = podContainers + item["status"]["containerStatuses"] + end + # Adding init containers to the record list as well. + if item["status"].key?("initContainerStatuses") && !item["status"]["initContainerStatuses"].empty? + podContainers = podContainers + item["status"]["initContainerStatuses"] + end + # if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? #container status block start + if !podContainers.empty? #container status block start + podContainers.each do |container| + containerRestartCount = 0 + lastFinishedTime = nil + # Need this flag to determine if we need to process container data for mdm metrics like oomkilled and container restart + #container Id is of the form + #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 + if !container["containerID"].nil? + record["ContainerID"] = container["containerID"].split("//")[1] + else + # for containers that have image issues (like invalid image/tag etc..) this will be empty. do not make it all 0 + record["ContainerID"] = "" + end + #keeping this as which is same as InstanceName in perf table + if podUid.nil? || container["name"].nil? + next + else + record["ContainerName"] = podUid + "/" + container["name"] + end + #Pod restart count is a sumtotal of restart counts of individual containers + #within the pod. The restart count of a container is maintained by kubernetes + #itself in the form of a container label. 
+ containerRestartCount = container["restartCount"] + record["ContainerRestartCount"] = containerRestartCount + + containerStatus = container["state"] + record["ContainerStatusReason"] = "" + # state is of the following form , so just picking up the first key name + # "state": { + # "waiting": { + # "reason": "CrashLoopBackOff", + # "message": "Back-off 5m0s restarting failed container=metrics-server pod=metrics-server-2011498749-3g453_kube-system(5953be5f-fcae-11e7-a356-000d3ae0e432)" + # } + # }, + # the below is for accounting 'NodeLost' scenario, where-in the containers in the lost node/pod(s) is still being reported as running + if podReadyCondition == false + record["ContainerStatus"] = "Unknown" + else + record["ContainerStatus"] = containerStatus.keys[0] + end + #TODO : Remove ContainerCreationTimeStamp from here since we are sending it as a metric + #Picking up both container and node start time from cAdvisor to be consistent + if containerStatus.keys[0] == "running" + record["ContainerCreationTimeStamp"] = container["state"]["running"]["startedAt"] + else + if !containerStatus[containerStatus.keys[0]]["reason"].nil? && !containerStatus[containerStatus.keys[0]]["reason"].empty? + record["ContainerStatusReason"] = containerStatus[containerStatus.keys[0]]["reason"] + end + # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm + if !record["ControllerKind"].nil? && record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB + @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus) + end + end + + # Record the last state of the container. This may have information on why a container was killed. + begin + if !container["lastState"].nil? 
&& container["lastState"].keys.length == 1 + lastStateName = container["lastState"].keys[0] + lastStateObject = container["lastState"][lastStateName] + if !lastStateObject.is_a?(Hash) + raise "expected a hash object. This could signify a bug or a kubernetes API change" + end + + if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") + newRecord = Hash.new + newRecord["lastState"] = lastStateName # get the name of the last state (ex: terminated) + lastStateReason = lastStateObject["reason"] + # newRecord["reason"] = lastStateObject["reason"] # (ex: OOMKilled) + newRecord["reason"] = lastStateReason # (ex: OOMKilled) + newRecord["startedAt"] = lastStateObject["startedAt"] # (ex: 2019-07-02T14:58:51Z) + lastFinishedTime = lastStateObject["finishedAt"] + newRecord["finishedAt"] = lastFinishedTime # (ex: 2019-07-02T14:58:52Z) + + # only write to the output field if everything previously ran without error + record["ContainerLastStatus"] = newRecord + + #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled + if lastStateReason.downcase == Constants::REASON_OOM_KILLED + @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + lastStateReason = nil + else + record["ContainerLastStatus"] = Hash.new + end + else + record["ContainerLastStatus"] = Hash.new + end + + #Populate mdm metric for container restart count if greater than 0 + if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) + @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + rescue => errorStr + $log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + record["ContainerLastStatus"] = Hash.new + end + + podRestartCount += containerRestartCount + records.push(record.dup) + end + else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod + records.push(record) + end #container status block end + + records.each do |record| + if !record.nil? + record["PodRestartCount"] = podRestartCount + end + end + rescue => error + $log.warn("getPodInventoryRecords failed: #{error}") + end + return records + end + + def getServiceNameFromLabels(namespace, labels, serviceRecords) serviceName = "" begin if !labels.nil? && !labels.empty? - if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].empty?) - serviceList["items"].each do |item| - found = 0 - if !item["spec"].nil? && !item["spec"]["selector"].nil? && item["metadata"]["namespace"] == namespace - selectorLabels = item["spec"]["selector"] - if !selectorLabels.empty? - selectorLabels.each do |key, value| - if !(labels.select { |k, v| k == key && v == value }.length > 0) - break - end - found = found + 1 + serviceRecords.each do |kubeServiceRecord| + found = 0 + if kubeServiceRecord["Namespace"] == namespace + selectorLabels = kubeServiceRecord["SelectorLabels"] + if !selectorLabels.empty? 
+ selectorLabels.each do |key, value| + if !(labels.select { |k, v| k == key && v == value }.length > 0) + break end + found = found + 1 end - if found == selectorLabels.length - return item["metadata"]["name"] - end + end + if found == selectorLabels.length + return kubeServiceRecord["ServiceName"] end end end From bede6efb4a818a451142172a59b133669850ab42 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 29 Nov 2020 16:22:34 -0800 Subject: [PATCH 02/45] optimize kube node inventory --- kubernetes/omsagent.yaml | 50 ++ source/plugins/ruby/in_kube_nodes.rb | 476 ++++++++++++-------- source/plugins/ruby/in_kube_podinventory.rb | 33 +- 3 files changed, 365 insertions(+), 194 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 85c383ec2..49a9235de 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -510,6 +510,56 @@ spec: memory: 250Mi env: # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these + # pod inventory plugin settings + - name: PODS_CHUNK_SIZE + value: "500" + - name: PODS_EMIT_STREAM_BATCH_SIZE + value: "250" + - name: PODS_EMIT_STREAM + value: "true" + - name: MDM_PODS_INVENTORY_EMIT_STREAM + value: "true" + - name: CONTAINER_PERF_EMIT_STREAM + value: "true" + - name: SERVICES_EMIT_STREAM + value: "true" + - name: GPU_PERF_EMIT_STREAM + value: "true" + + # node inventory plugin settings + - name: NODES_CHUNK_SIZE + value: "200" + - name: NODES_EMIT_STREAM_BATCH_SIZE + value: "100" + - name: NODES_EMIT_STREAM + value: "true" + - name: NODES_PERF_EMIT_STREAM + value: "true" + - name: GPU_NODES_PERF_EMIT_STREAM + value: "true" + - name: CONTAINER_NODE_INVENTORY_EMIT_STREAM + value: "true" + - name: MDM_KUBE_NODE_INVENTORY_EMIT_STREAM + value: "true" + + # event inventory plugin settings + - name: EVENTS_CHUNK_SIZE + value: "30000" + - name: EVENTS_EMIT_STREAM + value: "true" + + # kube state deployments + - name: DEPLOYMENTS_CHUNK_SIZE + value: "1000" + - name: 
DEPLOYMENTS_EMIT_STREAM + value: "true" + + # kube hpa + - name: HPA_CHUNK_SIZE + value: "2000" + - name: HPA_EMIT_STREAM + value: "true" + - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 4d58382f5..8346a1a2b 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -33,6 +33,13 @@ def initialize require_relative "oms_common" require_relative "omslog" @NODES_CHUNK_SIZE = "400" + # 0 indicates no batch enabled for stream emit + @NODES_EMIT_STREAM_BATCH_SIZE = 0 + @NODES_EMIT_STREAM = true + @NODES_PERF_EMIT_STREAM = true + @GPU_NODES_PERF_EMIT_STREAM = true + @CONTAINER_NODE_INVENTORY_EMIT_STREAM = true + @MDM_KUBE_NODE_INVENTORY_EMIT_STREAM = true require_relative "constants" end @@ -45,6 +52,46 @@ def configure(conf) def start if @run_interval + if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? + @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"] + end + $log.info("in_kube_nodes::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}") + + if !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].empty? + @NODES_EMIT_STREAM_BATCH_SIZE = ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i + NodesChunkSize = @NODES_CHUNK_SIZE.to_i + if @NODES_EMIT_STREAM_BATCH_SIZE > NodesChunkSize + $log.info("in_kube_nodes::start : NODES_EMIT_STREAM_BATCH_SIZE cant be greater than nodes chunksize @ #{@NODES_CHUNK_SIZE}") + @NODES_EMIT_STREAM_BATCH_SIZE = NodesChunkSize + end + end + $log.info("in_kube_nodes::start : NODES_EMIT_STREAM_BATCH_SIZE @ #{@NODES_EMIT_STREAM_BATCH_SIZE}") + + if !ENV["NODES_EMIT_STREAM"].nil? && !ENV["NODES_EMIT_STREAM"].empty? + @NODES_EMIT_STREAM = ENV["NODES_EMIT_STREAM"].to_s.downcase == "true" ? true : false + end + $log.info("in_kube_nodes::start : NODES_EMIT_STREAM @ #{@NODES_EMIT_STREAM}") + + if !ENV["CONTAINER_NODE_INVENTORY_EMIT_STREAM"].nil? 
&& !ENV["CONTAINER_NODE_INVENTORY_EMIT_STREAM"].empty? + @CONTAINER_NODE_INVENTORY_EMIT_STREAM = ENV["CONTAINER_NODE_INVENTORY_EMIT_STREAM"].to_s.downcase == "true" ? true : false + end + $log.info("in_kube_nodes::start : CONTAINER_NODE_INVENTORY_EMIT_STREAM @ #{@CONTAINER_NODE_INVENTORY_EMIT_STREAM}") + + if !ENV["MDM_KUBE_NODE_INVENTORY_EMIT_STREAM"].nil? && !ENV["MDM_KUBE_NODE_INVENTORY_EMIT_STREAM"].empty? + @MDM_KUBE_NODE_INVENTORY_EMIT_STREAM = ENV["MDM_KUBE_NODE_INVENTORY_EMIT_STREAM"].to_s.downcase == "true" ? true : false + end + $log.info("in_kube_nodes::start : MDM_KUBE_NODE_INVENTORY_EMIT_STREAM @ #{@MDM_KUBE_NODE_INVENTORY_EMIT_STREAM}") + + if !ENV["NODES_PERF_EMIT_STREAM"].nil? && !ENV["NODES_PERF_EMIT_STREAM"].empty? + @NODES_PERF_EMIT_STREAM = ENV["NODES_PERF_EMIT_STREAM"].to_s.downcase == "true" ? true : false + end + $log.info("in_kube_nodes::start : NODES_PERF_EMIT_STREAM @ #{@NODES_PERF_EMIT_STREAM}") + + if !ENV["GPU_NODES_PERF_EMIT_STREAM"].nil? && !ENV["GPU_NODES_PERF_EMIT_STREAM"].empty? + @GPU_NODES_PERF_EMIT_STREAM = ENV["GPU_NODES_PERF_EMIT_STREAM"].to_s.downcase == "true" ? 
true : false + end + $log.info("in_kube_nodes::start : GPU_NODES_PERF_EMIT_STREAM @ #{@GPU_NODES_PERF_EMIT_STREAM}") + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -109,210 +156,179 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) eventStream = MultiEventStream.new containerNodeInventoryEventStream = MultiEventStream.new insightsMetricsEventStream = MultiEventStream.new + kubePerfEventStream = MultiEventStream.new @@istestvar = ENV["ISTEST"] #get node inventory - nodeInventory["items"].each do |items| - record = {} - # Sending records for ContainerNodeInventory - containerNodeInventoryRecord = {} - containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] - - record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - record["Computer"] = items["metadata"]["name"] - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId - record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] - record["Labels"] = [items["metadata"]["labels"]] - record["Status"] = "" - - if !items["spec"]["providerID"].nil? && !items["spec"]["providerID"].empty? - if File.file?(@@AzStackCloudFileName) # existence of this file indicates agent running on azstack - record["KubernetesProviderID"] = "azurestack" - else - #Multicluster kusto query is filtering after splitting by ":" to the left, so do the same here - #https://msazure.visualstudio.com/One/_git/AzureUX-Monitoring?path=%2Fsrc%2FMonitoringExtension%2FClient%2FInfraInsights%2FData%2FQueryTemplates%2FMultiClusterKustoQueryTemplate.ts&_a=contents&version=GBdev - provider = items["spec"]["providerID"].split(":")[0] - if !provider.nil? && !provider.empty? 
- record["KubernetesProviderID"] = provider - else - record["KubernetesProviderID"] = items["spec"]["providerID"] + nodeInventory["items"].each do |item| + # node inventory + nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) + wrapper = { + "DataType" => "KUBE_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [nodeInventoryRecord.each { |k, v| nodeInventoryRecord[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + if @NODES_EMIT_STREAM + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream end - end - else - record["KubernetesProviderID"] = "onprem" - end - - # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. - # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we - # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" - # implying that the node is ready for hosting pods, however its out of disk. - - if items["status"].key?("conditions") && !items["status"]["conditions"].empty? - allNodeConditions = "" - items["status"]["conditions"].each do |condition| - if condition["status"] == "True" - if !allNodeConditions.empty? 
- allNodeConditions = allNodeConditions + "," + condition["type"] - else - allNodeConditions = condition["type"] - end + if @MDM_KUBE_NODE_INVENTORY_EMIT_STREAM + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream end - #collect last transition to/from ready (no matter ready is true/false) - if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? - record["LastTransitionTimeReady"] = condition["lastTransitionTime"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - end - if !allNodeConditions.empty? - record["Status"] = allNodeConditions - end + eventStream = MultiEventStream.new end - nodeInfo = items["status"]["nodeInfo"] - record["KubeletVersion"] = nodeInfo["kubeletVersion"] - record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] - containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] - containerRuntimeVersion = nodeInfo["containerRuntimeVersion"] - if containerRuntimeVersion.downcase.start_with?("docker://") - containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion.split("//")[1] - else - # using containerRuntimeVersion as DockerVersion as is for non docker runtimes - containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion - end - # ContainerNodeInventory data for docker version and operating system. 
- containerNodeInventoryWrapper = { - "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], - } - containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + # container node inventory + containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) + containerNodeInventoryWrapper = { + "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], + } + containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + if @CONTAINER_NODE_INVENTORY_EMIT_STREAM + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + end + containerNodeInventoryEventStream = MultiEventStream.new + end - wrapper = { - "DataType" => "KUBE_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper - # Adding telemetry to send node telemetry every 10 minutes - timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - properties = {} - properties["Computer"] = record["Computer"] - properties["KubeletVersion"] = record["KubeletVersion"] - properties["OperatingSystem"] = nodeInfo["operatingSystem"] - # DockerVersion field holds docker 
version if runtime is docker/moby else :// - if containerRuntimeVersion.downcase.start_with?("docker://") - properties["DockerVersion"] = containerRuntimeVersion.split("//")[1] - else - properties["DockerVersion"] = containerRuntimeVersion + # node metrics records + nodeMetricRecords = [] + nodeMetricRecord = KubernetesApiClient.parseNodeLimits(item, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimits(item, "allocatable", "memory", "memoryAllocatableBytes", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimits(item, "capacity", "cpu", "cpuCapacityNanoCores", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimits(item, "capacity", "memory", "memoryCapacityBytes", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? 
+ nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecords.each do |metricRecord| + metricRecord["DataType"] = "LINUX_PERF_BLOB" + metricRecord["IPName"] = "LogManagement" + kubePerfEventStream.add(emitTime, metricRecord) if metricRecord + end + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + if @NODES_PERF_EMIT_STREAM + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + end + kubePerfEventStream = MultiEventStream.new end - properties["KubernetesProviderID"] = record["KubernetesProviderID"] - properties["KernelVersion"] = nodeInfo["kernelVersion"] - properties["OSImage"] = nodeInfo["osImage"] - capacityInfo = items["status"]["capacity"] - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + # node GPU metrics record + nodeGPUInsightsMetricsRecords = [] + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "allocatable", "nvidia.com/gpu", "nodeGpuAllocatable", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "capacity", "nvidia.com/gpu", "nodeGpuCapacity", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "allocatable", "amd.com/gpu", "nodeGpuAllocatable", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? 
+ nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "capacity", "amd.com/gpu", "nodeGpuCapacity", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + end + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + if @GPU_NODES_PERF_EMIT_STREAM + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + end + insightsMetricsEventStream = MultiEventStream.new + end + # Adding telemetry to send node telemetry every 10 minutes + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + properties = getNodeTelemetryProps(item) + properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] + capacityInfo = item["status"]["capacity"] + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + begin + if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) + properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] + end - begin - if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) 
- properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] + if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) + properties["amdgpus"] = capacityInfo["amd.com/gpu"] + end + rescue => errorStr + $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) - properties["amdgpus"] = capacityInfo["amd.com/gpu"] + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents end - rescue => errorStr - $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - - # Telemetry for data collection config for replicaset - if (File.file?(@@configMapMountPath)) - properties["collectAllKubeEvents"] = @@collectAllKubeEvents - end - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = @@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = 
@@rsPromMonitorPodsNamespaceLength + end + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + telemetrySent = true end - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true - end end - router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream if telemetrySent == true @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i end - - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) - $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + if eventStream.count > 0 + if @NODES_EMIT_STREAM + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream + end + eventStream = nil end - #:optimize:kubeperf merge - begin - #if(!nodeInventory.empty?) 
- nodeMetricDataItems = [] - #allocatable metrics @ node level - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime)) - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "allocatable", "memory", "memoryAllocatableBytes", batchTime)) - #capacity metrics @ node level - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores", batchTime)) - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "memory", "memoryCapacityBytes", batchTime)) - - kubePerfEventStream = MultiEventStream.new - - nodeMetricDataItems.each do |record| - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" - kubePerfEventStream.add(emitTime, record) if record + if kubePerfEventStream.count > 0 + if @NODES_PERF_EMIT_STREAM + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream end - #end - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - - #start GPU InsightsMetrics items - begin - nodeGPUInsightsMetricsDataItems = [] - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "nvidia.com/gpu", "nodeGpuAllocatable", batchTime)) - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "nvidia.com/gpu", "nodeGpuCapacity", batchTime)) - - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "amd.com/gpu", "nodeGpuAllocatable", batchTime)) - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "amd.com/gpu", 
"nodeGpuCapacity", batchTime)) - - nodeGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(emitTime, wrapper) if wrapper - end - + kubePerfEventStream = nil + end + if insightsMetricsEventStream.count > 0 + if @GPU_NODES_PERF_EMIT_STREAM + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => errorStr - $log.warn "Failed when processing GPU metrics in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - #end GPU InsightsMetrics items - rescue => errorStr - $log.warn "Failed in enumerate for KubePerf from in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + insightsMetricsEventStream = nil end - #:optimize:end kubeperf merge - rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -352,5 +368,107 @@ def run_periodic end @mutex.unlock end + + def getNodeInventoryRecord(item, batchTime = Time.utc.iso8601) + record = {} + begin + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Computer"] = item["metadata"]["name"] + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = 
KubernetesApiClient.getClusterId + record["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] + record["Labels"] = [item["metadata"]["labels"]] + record["Status"] = "" + + if !item["spec"]["providerID"].nil? && !item["spec"]["providerID"].empty? + if File.file?(@@AzStackCloudFileName) # existence of this file indicates agent running on azstack + record["KubernetesProviderID"] = "azurestack" + else + #Multicluster kusto query is filtering after splitting by ":" to the left, so do the same here + #https://msazure.visualstudio.com/One/_git/AzureUX-Monitoring?path=%2Fsrc%2FMonitoringExtension%2FClient%2FInfraInsights%2FData%2FQueryTemplates%2FMultiClusterKustoQueryTemplate.ts&_a=contents&version=GBdev + provider = item["spec"]["providerID"].split(":")[0] + if !provider.nil? && !provider.empty? + record["KubernetesProviderID"] = provider + else + record["KubernetesProviderID"] = item["spec"]["providerID"] + end + end + else + record["KubernetesProviderID"] = "onprem" + end + + # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. + # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we + # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" + # implying that the node is ready for hosting pods, however its out of disk. + if item["status"].key?("conditions") && !item["status"]["conditions"].empty? + allNodeConditions = "" + item["status"]["conditions"].each do |condition| + if condition["status"] == "True" + if !allNodeConditions.empty? + allNodeConditions = allNodeConditions + "," + condition["type"] + else + allNodeConditions = condition["type"] + end + end + #collect last transition to/from ready (no matter ready is true/false) + if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? 
+ record["LastTransitionTimeReady"] = condition["lastTransitionTime"] + end + end + if !allNodeConditions.empty? + record["Status"] = allNodeConditions + end + end + nodeInfo = item["status"]["nodeInfo"] + record["KubeletVersion"] = nodeInfo["kubeletVersion"] + record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] + rescue => errorStr + $log.warn "in_kube_nodes::getNodeInventoryRecord:Failed: #{errorStr}" + end + return record + end + + def getContainerNodeInventoryRecord(item, batchTime = Time.utc.iso8601) + containerNodeInventoryRecord = {} + begin + containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerNodeInventoryRecord["Computer"] = item["metadata"]["name"] + nodeInfo = item["status"]["nodeInfo"] + containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] + containerRuntimeVersion = nodeInfo["containerRuntimeVersion"] + if containerRuntimeVersion.downcase.start_with?("docker://") + containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion.split("//")[1] + else + # using containerRuntimeVersion as DockerVersion as is for non docker runtimes + containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion + end + rescue => errorStr + $log.warn "in_kube_nodes::getContainerNodeInventoryRecord:Failed: #{errorStr}" + end + return containerNodeInventoryRecord + end + + def getNodeTelemetryProps(item) + properties = {} + begin + properties["Computer"] = item["metadata"]["name"] + nodeInfo = item["status"]["nodeInfo"] + properties["KubeletVersion"] = nodeInfo["kubeletVersion"] + properties["OperatingSystem"] = nodeInfo["osImage"] + properties["KernelVersion"] = nodeInfo["kernelVersion"] + properties["OSImage"] = nodeInfo["osImage"] + containerRuntimeVersion = nodeInfo["containerRuntimeVersion"] + if containerRuntimeVersion.downcase.start_with?("docker://") + properties["DockerVersion"] = containerRuntimeVersion.split("//")[1] + else + # using 
containerRuntimeVersion as DockerVersion as is for non docker runtimes + properties["DockerVersion"] = containerRuntimeVersion + end + rescue => errorStr + $log.warn "in_kube_nodes::getContainerNodeIngetNodeTelemetryPropsventoryRecord:Failed: #{errorStr}" + end + return properties + end end # Kube_Node_Input end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 0eead7782..9a20be62d 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -78,6 +78,11 @@ def start if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i + PodsChunkSize = @PODS_CHUNK_SIZE.to_i + if @PODS_EMIT_STREAM_BATCH_SIZE > PodsChunkSize + $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE shouldnt be greater than @ #{@PODS_CHUNK_SIZE} ") + @PODS_EMIT_STREAM_BATCH_SIZE = PodsChunkSize + end end $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") @@ -291,20 +296,14 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end end #podInventory block end - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - if eventStream.count > 0 - $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - if insightsMetricsEventStream.count > 0 - $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - end - if eventStream.count > 0 if @PODS_EMIT_STREAM $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream end + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end eventStream = nil end @@ -321,10 +320,13 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream end + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end insightsMetricsEventStream = nil end - if continuationToken.nil? #no more chunks in this batch to be sent, get all pod inventory records to send + if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send @log.info "Sending pod inventory mdm records to out_mdm" pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" @@ -348,7 +350,6 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc "DataItems" => [kubeServiceRecord.each { |k, v| kubeServiceRecord[k] = v }], } kubeServicesEventStream.add(emitTime, kubeServicewrapper) if kubeServicewrapper - if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE if @SERVICES_EMIT_STREAM $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") @@ -415,15 +416,17 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated record["Name"] = item["metadata"]["name"] podNameSpace = 
item["metadata"]["namespace"] + podUid = KubernetesApiClient.getPodUid(podNameSpace, item["metadata"]) + if podUid.nil? + return records + end + nodeName = "" #for unscheduled (non-started) pods nodeName does NOT exist if !item["spec"]["nodeName"].nil? nodeName = item["spec"]["nodeName"] end - podUid = KubernetesApiClient.getPodUid(podNameSpace, item["metadata"]) - if podUid.nil? - return records - end + # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes if KubernetesApiClient.isAROv3MasterOrInfraPod(nodeName) return records end From 9f7759e37334acb6ef43c370107db4ec3d1c2383 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 29 Nov 2020 16:36:21 -0800 Subject: [PATCH 03/45] add flags for events, deployments and hpa --- source/plugins/ruby/in_kube_events.rb | 18 +- .../plugins/ruby/in_kubestate_deployments.rb | 424 +++++++++--------- source/plugins/ruby/in_kubestate_hpa.rb | 423 ++++++++--------- 3 files changed, 448 insertions(+), 417 deletions(-) diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index 6f59a3fc1..561909246 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -19,6 +19,7 @@ def initialize # 30000 events account to approximately 5MB @EVENTS_CHUNK_SIZE = 30000 + @EVENTS_EMIT_STREAM = true # Initializing events count for telemetry @eventsCount = 0 @@ -36,6 +37,15 @@ def configure(conf) def start if @run_interval + if !ENV["EVENTS_CHUNK_SIZE"].nil? && !ENV["EVENTS_CHUNK_SIZE"].empty? + @EVENTS_CHUNK_SIZE = ENV["EVENTS_CHUNK_SIZE"] + end + $log.info("in_kube_events::start : EVENTS_CHUNK_SIZE @ #{@EVENTS_CHUNK_SIZE}") + + if !ENV["EVENTS_EMIT_STREAM"].nil? && !ENV["EVENTS_EMIT_STREAM"].empty? + @EVENTS_EMIT_STREAM = ENV["EVENTS_EMIT_STREAM"].to_s.downcase == "true" ? 
true : false + end + $log.info("in_kube_events::start : EVENTS_EMIT_STREAM @ #{@EVENTS_EMIT_STREAM}") @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -82,6 +92,8 @@ def enumerate end $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?) + eventsCount = eventList["items"].length + $log.info "in_kube_events::enumerate:Received number of events is eventList is #{eventsCount} @ #{Time.now.utc.iso8601}" newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime) else $log.warn "in_kube_events::enumerate:Received empty eventList" @@ -91,6 +103,8 @@ def enumerate while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, eventList = KubernetesApiClient.getResourcesAndContinuationToken("events?fieldSelector=type!=Normal&limit=#{@EVENTS_CHUNK_SIZE}&continue=#{continuationToken}") if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?) 
+ eventsCount = eventList["items"].length + $log.info "in_kube_events::enumerate:Received number of events is eventList is #{eventsCount} @ #{Time.now.utc.iso8601}" newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime) else $log.warn "in_kube_events::enumerate:Received empty eventList" @@ -156,7 +170,9 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim eventStream.add(emitTime, wrapper) if wrapper @eventsCount += 1 end - router.emit_stream(@tag, eventStream) if eventStream + if @EVENTS_EMIT_STREAM + router.emit_stream(@tag, eventStream) if eventStream + end rescue => errorStr $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index bcf397150..e1679626d 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -2,230 +2,238 @@ # frozen_string_literal: true module Fluent - class Kube_Kubestate_Deployments_Input < Input - Plugin.register_input("kubestatedeployments", self) - @@istestvar = ENV["ISTEST"] - # telemetry - To keep telemetry cost reasonable, we keep track of the max deployments over a period of 15m - @@deploymentsCount = 0 - - - - def initialize - super - require "yajl/json_gem" - require "yajl" - require "date" - require "time" - - require_relative "KubernetesApiClient" - require_relative "oms_common" - require_relative "omslog" - require_relative "ApplicationInsightsUtility" - require_relative "constants" - - # roughly each deployment is 8k - # 1000 deployments account to approximately 8MB - @DEPLOYMENTS_CHUNK_SIZE = 1000 - @DEPLOYMENTS_API_GROUP = "apps" - @@telemetryLastSentTime = DateTime.now.to_time.to_i - - - @deploymentsRunningTotal = 0 - - @NodeName = OMS::Common.get_hostname - @ClusterId = KubernetesApiClient.getClusterId - @ClusterName = 
KubernetesApiClient.getClusterName - end - - config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG - - def configure(conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) + class Kube_Kubestate_Deployments_Input < Input + Plugin.register_input("kubestatedeployments", self) + @@istestvar = ENV["ISTEST"] + # telemetry - To keep telemetry cost reasonable, we keep track of the max deployments over a period of 15m + @@deploymentsCount = 0 + + def initialize + super + require "yajl/json_gem" + require "yajl" + require "date" + require "time" + + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + require_relative "ApplicationInsightsUtility" + require_relative "constants" + + # roughly each deployment is 8k + # 1000 deployments account to approximately 8MB + @DEPLOYMENTS_CHUNK_SIZE = 1000 + @DEPLOYMENTS_EMIT_STREAM = true + @DEPLOYMENTS_API_GROUP = "apps" + @@telemetryLastSentTime = DateTime.now.to_time.to_i + + @deploymentsRunningTotal = 0 + + @NodeName = OMS::Common.get_hostname + @ClusterId = KubernetesApiClient.getClusterId + @ClusterName = KubernetesApiClient.getClusterName + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG + + def configure(conf) + super + end + + def start + if @run_interval + if !ENV["DEPLOYMENTS_CHUNK_SIZE"].nil? && !ENV["DEPLOYMENTS_CHUNK_SIZE"].empty? + @DEPLOYMENTS_CHUNK_SIZE = ENV["DEPLOYMENTS_CHUNK_SIZE"] end - end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join + $log.info("in_kubestate_deployments::start : DEPLOYMENTS_CHUNK_SIZE @ #{@DEPLOYMENTS_CHUNK_SIZE}") + + if !ENV["DEPLOYMENTS_EMIT_STREAM"].nil? 
&& !ENV["DEPLOYMENTS_EMIT_STREAM"].empty? + @DEPLOYMENTS_EMIT_STREAM = ENV["DEPLOYMENTS_EMIT_STREAM"].to_s.downcase == "true" ? true : false end + $log.info("in_kubestate_deployments::start : DEPLOYMENTS_EMIT_STREAM @ #{@DEPLOYMENTS_EMIT_STREAM}") + + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) end - - def enumerate - begin - deploymentList = nil - currentTime = Time.now - batchTime = currentTime.utc.iso8601 - - #set the running total for this batch to 0 - @deploymentsRunningTotal = 0 - - # Initializing continuation token to nil - continuationToken = nil - $log.info("in_kubestate_deployments::enumerate : Getting deployments from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}", api_group: @DEPLOYMENTS_API_GROUP) - $log.info("in_kubestate_deployments::enumerate : Done getting deployments from Kube API @ #{Time.now.utc.iso8601}") + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def enumerate + begin + deploymentList = nil + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + + #set the running total for this batch to 0 + @deploymentsRunningTotal = 0 + + # Initializing continuation token to nil + continuationToken = nil + $log.info("in_kubestate_deployments::enumerate : Getting deployments from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}", api_group: @DEPLOYMENTS_API_GROUP) + $log.info("in_kubestate_deployments::enumerate : Done getting deployments from Kube API @ #{Time.now.utc.iso8601}") + if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) 
+ parse_and_emit_records(deploymentList, batchTime) + else + $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" + end + + #If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @DEPLOYMENTS_API_GROUP) if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) parse_and_emit_records(deploymentList, batchTime) else $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @DEPLOYMENTS_API_GROUP) - if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) 
- parse_and_emit_records(deploymentList, batchTime) - else - $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" - end + end + + # Setting this to nil so that we dont hold memory until GC kicks in + deploymentList = nil + + $log.info("successfully emitted a total of #{@deploymentsRunningTotal} kube_state_deployment metrics") + # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 + if (@deploymentsRunningTotal > @@deploymentsCount) + @@deploymentsCount = @deploymentsRunningTotal + end + if (((DateTime.now.to_time.to_i - @@telemetryLastSentTime).abs) / 60) >= Constants::KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES + #send telemetry + $log.info "sending deployemt telemetry..." + ApplicationInsightsUtility.sendMetricTelemetry("MaxDeploymentCount", @@deploymentsCount, {}) + #reset last sent value & time + @@deploymentsCount = 0 + @@telemetryLastSentTime = DateTime.now.to_time.to_i + end + rescue => errorStr + $log.warn "in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}") + end + end # end enumerate + + def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) + metricItems = [] + insightsMetricsEventStream = MultiEventStream.new + begin + metricInfo = deployments + metricInfo["items"].each do |deployment| + deploymentName = deployment["metadata"]["name"] + deploymentNameSpace = deployment["metadata"]["namespace"] + deploymentCreatedTime = "" + if !deployment["metadata"]["creationTimestamp"].nil? + deploymentCreatedTime = deployment["metadata"]["creationTimestamp"] + end + deploymentStrategy = "RollingUpdate" #default when not specified as per spec + if !deployment["spec"]["strategy"].nil? && !deployment["spec"]["strategy"]["type"].nil? 
+ deploymentStrategy = deployment["spec"]["strategy"]["type"] end - - # Setting this to nil so that we dont hold memory until GC kicks in - deploymentList = nil - - $log.info("successfully emitted a total of #{@deploymentsRunningTotal} kube_state_deployment metrics") - # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 - if (@deploymentsRunningTotal > @@deploymentsCount) - @@deploymentsCount = @deploymentsRunningTotal + deploymentSpecReplicas = 1 #default is 1 as per k8s spec + if !deployment["spec"]["replicas"].nil? + deploymentSpecReplicas = deployment["spec"]["replicas"] end - if (((DateTime.now.to_time.to_i - @@telemetryLastSentTime).abs)/60 ) >= Constants::KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES - #send telemetry - $log.info "sending deployemt telemetry..." - ApplicationInsightsUtility.sendMetricTelemetry("MaxDeploymentCount", @@deploymentsCount, {}) - #reset last sent value & time - @@deploymentsCount = 0 - @@telemetryLastSentTime = DateTime.now.to_time.to_i + deploymentStatusReadyReplicas = 0 + if !deployment["status"]["readyReplicas"].nil? + deploymentStatusReadyReplicas = deployment["status"]["readyReplicas"] end - rescue => errorStr - $log.warn "in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}") + deploymentStatusUpToDateReplicas = 0 + if !deployment["status"]["updatedReplicas"].nil? + deploymentStatusUpToDateReplicas = deployment["status"]["updatedReplicas"] + end + deploymentStatusAvailableReplicas = 0 + if !deployment["status"]["availableReplicas"].nil? 
+ deploymentStatusAvailableReplicas = deployment["status"]["availableReplicas"] + end + + metricItem = {} + metricItem["CollectionTime"] = batchTime + metricItem["Computer"] = @NodeName + metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE + metricItem["Value"] = deploymentStatusReadyReplicas + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME] = deploymentName + metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = deploymentNameSpace + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY] = deploymentStrategy + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = deploymentCreatedTime + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS] = deploymentSpecReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED] = deploymentStatusUpToDateReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE] = deploymentStatusAvailableReplicas + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) end - end # end enumerate - - def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) - metricItems = [] - insightsMetricsEventStream = MultiEventStream.new - begin - metricInfo = deployments - metricInfo["items"].each do |deployment| - deploymentName = deployment["metadata"]["name"] - deploymentNameSpace = deployment["metadata"]["namespace"] - deploymentCreatedTime = "" - if !deployment["metadata"]["creationTimestamp"].nil? 
- deploymentCreatedTime = deployment["metadata"]["creationTimestamp"] - end - deploymentStrategy = "RollingUpdate" #default when not specified as per spec - if !deployment["spec"]["strategy"].nil? && !deployment["spec"]["strategy"]["type"].nil? - deploymentStrategy = deployment["spec"]["strategy"]["type"] - end - deploymentSpecReplicas = 1 #default is 1 as per k8s spec - if !deployment["spec"]["replicas"].nil? - deploymentSpecReplicas = deployment["spec"]["replicas"] - end - deploymentStatusReadyReplicas = 0 - if !deployment["status"]["readyReplicas"].nil? - deploymentStatusReadyReplicas = deployment["status"]["readyReplicas"] - end - deploymentStatusUpToDateReplicas = 0 - if !deployment["status"]["updatedReplicas"].nil? - deploymentStatusUpToDateReplicas = deployment["status"]["updatedReplicas"] - end - deploymentStatusAvailableReplicas = 0 - if !deployment["status"]["availableReplicas"].nil? - deploymentStatusAvailableReplicas = deployment["status"]["availableReplicas"] - end - - metricItem = {} - metricItem["CollectionTime"] = batchTime - metricItem["Computer"] = @NodeName - metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE - metricItem["Value"] = deploymentStatusReadyReplicas - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME] = deploymentName - metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = deploymentNameSpace - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY ] = deploymentStrategy - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = deploymentCreatedTime - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS] = 
deploymentSpecReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED] = deploymentStatusUpToDateReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE] = deploymentStatusAvailableReplicas - - - metricItem["Tags"] = metricTags - - metricItems.push(metricItem) - end - - time = Time.now.to_f - metricItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper - end - - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - $log.info("successfully emitted #{metricItems.length()} kube_state_deployment metrics") - @deploymentsRunningTotal = @deploymentsRunningTotal + metricItems.length() - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubestatedeploymentsInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => error - $log.warn("in_kubestate_deployments::parse_and_emit_records failed: #{error} ") - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::parse_and_emit_records failed: #{error}") + + time = Time.now.to_f + metricItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(time, wrapper) if wrapper + end + + if @DEPLOYMENTS_EMIT_STREAM + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + $log.info("successfully emitted #{metricItems.length()} kube_state_deployment metrics") + end + @deploymentsRunningTotal = @deploymentsRunningTotal + metricItems.length() + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("kubestatedeploymentsInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - + rescue => error + $log.warn("in_kubestate_deployments::parse_and_emit_records failed: #{error} ") + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::parse_and_emit_records failed: #{error}") end - - def run_periodic - @mutex.lock + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished - @nextTimeToRun = Time.now - @waitTimeout = @run_interval - until done - @nextTimeToRun = @nextTimeToRun + @run_interval - @now = Time.now - if @nextTimeToRun <= @now - @waitTimeout = 1 - @nextTimeToRun = @now - else - @waitTimeout = @nextTimeToRun - @now - end - @condition.wait(@mutex, @waitTimeout) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kubestate_deployments::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") - enumerate - $log.info("in_kubestate_deployments::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn "in_kubestate_deployments::run_periodic: enumerate Failed to retrieve kube deployments: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::run_periodic: enumerate Failed to retrieve kube deployments: #{errorStr}") - end + @mutex.unlock + if !done + begin + $log.info("in_kubestate_deployments::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kubestate_deployments::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kubestate_deployments::run_periodic: enumerate Failed 
to retrieve kube deployments: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::run_periodic: enumerate Failed to retrieve kube deployments: #{errorStr}") end - @mutex.lock end - @mutex.unlock + @mutex.lock end + @mutex.unlock end -end \ No newline at end of file + end +end diff --git a/source/plugins/ruby/in_kubestate_hpa.rb b/source/plugins/ruby/in_kubestate_hpa.rb index 3ce63a75a..ac7d1e853 100644 --- a/source/plugins/ruby/in_kubestate_hpa.rb +++ b/source/plugins/ruby/in_kubestate_hpa.rb @@ -2,231 +2,238 @@ # frozen_string_literal: true module Fluent - class Kube_Kubestate_HPA_Input < Input - Plugin.register_input("kubestatehpa", self) - @@istestvar = ENV["ISTEST"] - - - def initialize - super - require "yajl/json_gem" - require "yajl" - require "time" - - require_relative "KubernetesApiClient" - require_relative "oms_common" - require_relative "omslog" - require_relative "ApplicationInsightsUtility" - require_relative "constants" - - # roughly each HPA is 3k - # 2000 HPAs account to approximately 6-7MB - @HPA_CHUNK_SIZE = 2000 - @HPA_API_GROUP = "autoscaling" - - # telemetry - @hpaCount = 0 - - @NodeName = OMS::Common.get_hostname - @ClusterId = KubernetesApiClient.getClusterId - @ClusterName = KubernetesApiClient.getClusterName - end - - config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG - - def configure(conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) + class Kube_Kubestate_HPA_Input < Input + Plugin.register_input("kubestatehpa", self) + @@istestvar = ENV["ISTEST"] + + def initialize + super + require "yajl/json_gem" + require "yajl" + require "time" + + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + require_relative "ApplicationInsightsUtility" + 
require_relative "constants" + + # roughly each HPA is 3k + # 2000 HPAs account to approximately 6-7MB + @HPA_CHUNK_SIZE = 2000 + @HPA_API_GROUP = "autoscaling" + @HPA_EMIT_STREAM = true + + # telemetry + @hpaCount = 0 + + @NodeName = OMS::Common.get_hostname + @ClusterId = KubernetesApiClient.getClusterId + @ClusterName = KubernetesApiClient.getClusterName + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG + + def configure(conf) + super + end + + def start + if @run_interval + if !ENV["HPA_CHUNK_SIZE"].nil? && !ENV["HPA_CHUNK_SIZE"].empty? + @HPA_CHUNK_SIZE = ENV["HPA_CHUNK_SIZE"] end - end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join + $log.info("in_kubestate_hpa::start : HPA_CHUNK_SIZE @ #{@HPA_CHUNK_SIZE}") + + if !ENV["HPA_EMIT_STREAM"].nil? && !ENV["HPA_EMIT_STREAM"].empty? + @HPA_EMIT_STREAM = ENV["HPA_EMIT_STREAM"].to_s.downcase == "true" ? 
true : false end + $log.info("in_kubestate_hpa::start : HPA_EMIT_STREAM @ #{@HPA_EMIT_STREAM}") + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join end - - def enumerate - begin - hpaList = nil - currentTime = Time.now - batchTime = currentTime.utc.iso8601 - - @hpaCount = 0 - - # Initializing continuation token to nil - continuationToken = nil - $log.info("in_kubestate_hpa::enumerate : Getting HPAs from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}", api_group: @HPA_API_GROUP) - $log.info("in_kubestate_hpa::enumerate : Done getting HPAs from Kube API @ #{Time.now.utc.iso8601}") + end + + def enumerate + begin + hpaList = nil + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + + @hpaCount = 0 + + # Initializing continuation token to nil + continuationToken = nil + $log.info("in_kubestate_hpa::enumerate : Getting HPAs from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}", api_group: @HPA_API_GROUP) + $log.info("in_kubestate_hpa::enumerate : Done getting HPAs from Kube API @ #{Time.now.utc.iso8601}") + if (!hpaList.nil? && !hpaList.empty? && hpaList.key?("items") && !hpaList["items"].nil? && !hpaList["items"].empty?) + parse_and_emit_records(hpaList, batchTime) + else + $log.warn "in_kubestate_hpa::enumerate:Received empty hpaList" + end + + #If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) 
+ continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @HPA_API_GROUP) if (!hpaList.nil? && !hpaList.empty? && hpaList.key?("items") && !hpaList["items"].nil? && !hpaList["items"].empty?) parse_and_emit_records(hpaList, batchTime) else $log.warn "in_kubestate_hpa::enumerate:Received empty hpaList" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @HPA_API_GROUP) - if (!hpaList.nil? && !hpaList.empty? && hpaList.key?("items") && !hpaList["items"].nil? && !hpaList["items"].empty?) - parse_and_emit_records(hpaList, batchTime) - else - $log.warn "in_kubestate_hpa::enumerate:Received empty hpaList" + end + + # Setting this to nil so that we dont hold memory until GC kicks in + hpaList = nil + + # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 + if (@hpaCount > 0) + # this will not be a useful telemetry, as hpa counts will not be huge, just log for now + $log.info("in_kubestate_hpa::hpaCount= #{hpaCount}") + #ApplicationInsightsUtility.sendMetricTelemetry("HPACount", @hpaCount, {}) + end + rescue => errorStr + $log.warn "in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}") + end + end # end enumerate + + def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) + metricItems = [] + insightsMetricsEventStream = MultiEventStream.new + begin + metricInfo = hpas + metricInfo["items"].each do |hpa| + hpaName = hpa["metadata"]["name"] + hpaNameSpace = 
hpa["metadata"]["namespace"] + hpaCreatedTime = "" + if !hpa["metadata"]["creationTimestamp"].nil? + hpaCreatedTime = hpa["metadata"]["creationTimestamp"] + end + hpaSpecMinReplicas = 1 #default is 1 as per k8s spec + if !hpa["spec"]["minReplicas"].nil? + hpaSpecMinReplicas = hpa["spec"]["minReplicas"] + end + hpaSpecMaxReplicas = 0 + if !hpa["spec"]["maxReplicas"].nil? + hpaSpecMaxReplicas = hpa["spec"]["maxReplicas"] + end + hpaSpecScaleTargetKind = "" + hpaSpecScaleTargetName = "" + if !hpa["spec"]["scaleTargetRef"].nil? + if !hpa["spec"]["scaleTargetRef"]["kind"].nil? + hpaSpecScaleTargetKind = hpa["spec"]["scaleTargetRef"]["kind"] + end + if !hpa["spec"]["scaleTargetRef"]["name"].nil? + hpaSpecScaleTargetName = hpa["spec"]["scaleTargetRef"]["name"] end end - - # Setting this to nil so that we dont hold memory until GC kicks in - hpaList = nil - - # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 - if (@hpaCount > 0) - # this will not be a useful telemetry, as hpa counts will not be huge, just log for now - $log.info("in_kubestate_hpa::hpaCount= #{hpaCount}") - #ApplicationInsightsUtility.sendMetricTelemetry("HPACount", @hpaCount, {}) + hpaStatusCurrentReplicas = 0 + if !hpa["status"]["currentReplicas"].nil? + hpaStatusCurrentReplicas = hpa["status"]["currentReplicas"] end - rescue => errorStr - $log.warn "in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}") + hpaStatusDesiredReplicas = 0 + if !hpa["status"]["desiredReplicas"].nil? + hpaStatusDesiredReplicas = hpa["status"]["desiredReplicas"] + end + + hpaStatuslastScaleTime = "" + if !hpa["status"]["lastScaleTime"].nil? 
+ hpaStatuslastScaleTime = hpa["status"]["lastScaleTime"] + end + + metricItem = {} + metricItem["CollectionTime"] = batchTime + metricItem["Computer"] = @NodeName + metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE + metricItem["Value"] = hpaStatusCurrentReplicas + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME] = hpaName + metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = hpaNameSpace + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = hpaCreatedTime + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS] = hpaSpecMinReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS] = hpaSpecMaxReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND] = hpaSpecScaleTargetKind + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME] = hpaSpecScaleTargetName + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS] = hpaStatusDesiredReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME] = hpaStatuslastScaleTime + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) end - end # end enumerate - - def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) - metricItems = [] - insightsMetricsEventStream = MultiEventStream.new - begin - metricInfo = hpas - metricInfo["items"].each do |hpa| - hpaName = hpa["metadata"]["name"] - hpaNameSpace = hpa["metadata"]["namespace"] - hpaCreatedTime = "" - if !hpa["metadata"]["creationTimestamp"].nil? 
- hpaCreatedTime = hpa["metadata"]["creationTimestamp"] - end - hpaSpecMinReplicas = 1 #default is 1 as per k8s spec - if !hpa["spec"]["minReplicas"].nil? - hpaSpecMinReplicas = hpa["spec"]["minReplicas"] - end - hpaSpecMaxReplicas = 0 - if !hpa["spec"]["maxReplicas"].nil? - hpaSpecMaxReplicas = hpa["spec"]["maxReplicas"] - end - hpaSpecScaleTargetKind = "" - hpaSpecScaleTargetName = "" - if !hpa["spec"]["scaleTargetRef"].nil? - if !hpa["spec"]["scaleTargetRef"]["kind"].nil? - hpaSpecScaleTargetKind = hpa["spec"]["scaleTargetRef"]["kind"] - end - if !hpa["spec"]["scaleTargetRef"]["name"].nil? - hpaSpecScaleTargetName = hpa["spec"]["scaleTargetRef"]["name"] - end - - end - hpaStatusCurrentReplicas = 0 - if !hpa["status"]["currentReplicas"].nil? - hpaStatusCurrentReplicas = hpa["status"]["currentReplicas"] - end - hpaStatusDesiredReplicas = 0 - if !hpa["status"]["desiredReplicas"].nil? - hpaStatusDesiredReplicas = hpa["status"]["desiredReplicas"] - end - - hpaStatuslastScaleTime = "" - if !hpa["status"]["lastScaleTime"].nil? 
- hpaStatuslastScaleTime = hpa["status"]["lastScaleTime"] - end - - - metricItem = {} - metricItem["CollectionTime"] = batchTime - metricItem["Computer"] = @NodeName - metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE - metricItem["Value"] = hpaStatusCurrentReplicas - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME] = hpaName - metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = hpaNameSpace - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = hpaCreatedTime - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS] = hpaSpecMinReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS] = hpaSpecMaxReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND] = hpaSpecScaleTargetKind - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME] = hpaSpecScaleTargetName - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS] = hpaStatusDesiredReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME] = hpaStatuslastScaleTime - - - metricItem["Tags"] = metricTags - - metricItems.push(metricItem) - end - time = Time.now.to_f - metricItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper - end - - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - 
$log.info("successfully emitted #{metricItems.length()} kube_state_hpa metrics") - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubestatehpaInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => error - $log.warn("in_kubestate_hpa::parse_and_emit_records failed: #{error} ") - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::parse_and_emit_records failed: #{error}") + time = Time.now.to_f + metricItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(time, wrapper) if wrapper + end + + if @HPA_EMIT_STREAM + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + $log.info("successfully emitted #{metricItems.length()} kube_state_hpa metrics") end - + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("kubestatehpaInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + rescue => error + $log.warn("in_kubestate_hpa::parse_and_emit_records failed: #{error} ") + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::parse_and_emit_records failed: #{error}") end - - def run_periodic - @mutex.lock + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished - @nextTimeToRun = Time.now - @waitTimeout = @run_interval - until done - @nextTimeToRun = @nextTimeToRun + @run_interval - @now = Time.now - if @nextTimeToRun <= @now - @waitTimeout = 1 - @nextTimeToRun = @now - else - @waitTimeout = @nextTimeToRun - @now - end - @condition.wait(@mutex, @waitTimeout) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kubestate_hpa::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") - enumerate - $log.info("in_kubestate_hpa::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn "in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}") - end + @mutex.unlock + if !done + begin + $log.info("in_kubestate_hpa::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kubestate_hpa::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}" + 
ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}") end - @mutex.lock end - @mutex.unlock + @mutex.lock end + @mutex.unlock end -end \ No newline at end of file + end +end From 6073fed8d0c32e4607519ac6075457beb1b4ca49 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 29 Nov 2020 16:46:06 -0800 Subject: [PATCH 04/45] have separate function parseNodeLimits --- source/plugins/ruby/KubernetesApiClient.rb | 49 ++++++++++++++++++++-- source/plugins/ruby/in_kube_nodes.rb | 8 ++-- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 13c084a5c..893fd438f 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -569,7 +569,50 @@ def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, return metricItems end #getContainerResourceRequestAndLimitsAsInsightsMetrics - def parseNodeLimits(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + metricItems = [] + begin + metricInfo = metricJSON + clusterId = getClusterId + #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, + #if we are coming up with the time it should be same for all nodes + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + metricInfo["items"].each do |node| + if (!node["status"][metricCategory].nil?) 
+ + # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem = {} + metricItem["DataItems"] = [] + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = node["metadata"]["name"] + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = node["metadata"]["name"] + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #push node level metrics to a inmem hash so that we can use it looking up at container level. 
+ #Currently if container level cpu & memory limits are not defined we default to node level limits + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") + end + end + rescue => error + @Log.warn("parseNodeLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + end + return metricItems + end #parseNodeLimits + + def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItem = {} begin clusterId = getClusterId @@ -602,10 +645,10 @@ def parseNodeLimits(node, metricCategory, metricNameToCollect, metricNametoRetur #@Log.info ("Node metric hash: #{@@NodeMetrics}") end rescue => error - @Log.warn("parseNodeLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + @Log.warn("parseNodeLimitsFromNodeItem failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") end return metricItem - end #parseNodeLimits + end #parseNodeLimitsFromNodeItem def parseNodeLimitsAsInsightsMetrics(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItem = {} diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 8346a1a2b..e9c00a642 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -202,19 +202,19 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) # node metrics records nodeMetricRecords = [] - nodeMetricRecord = KubernetesApiClient.parseNodeLimits(item, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime) + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime) if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? 
nodeMetricRecords.push(nodeMetricRecord) end - nodeMetricRecord = KubernetesApiClient.parseNodeLimits(item, "allocatable", "memory", "memoryAllocatableBytes", batchTime) + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "memory", "memoryAllocatableBytes", batchTime) if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? nodeMetricRecords.push(nodeMetricRecord) end - nodeMetricRecord = KubernetesApiClient.parseNodeLimits(item, "capacity", "cpu", "cpuCapacityNanoCores", batchTime) + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "cpu", "cpuCapacityNanoCores", batchTime) if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? nodeMetricRecords.push(nodeMetricRecord) end - nodeMetricRecord = KubernetesApiClient.parseNodeLimits(item, "capacity", "memory", "memoryCapacityBytes", batchTime) + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "memory", "memoryCapacityBytes", batchTime) if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? nodeMetricRecords.push(nodeMetricRecord) end From 97f55f75d7c60b7ebf01659754e215b2e6090b85 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 29 Nov 2020 16:53:18 -0800 Subject: [PATCH 05/45] refactor code --- source/plugins/ruby/KubernetesApiClient.rb | 27 ++-------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 893fd438f..eb2aa3425 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -578,32 +578,9 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet #if we are coming up with the time it should be same for all nodes #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z metricInfo["items"].each do |node| - if (!node["status"][metricCategory].nil?) 
- - # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" - metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem["DataItems"] = [] - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = node["metadata"]["name"] - # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent - metricProps["Computer"] = node["metadata"]["name"] - metricProps["ObjectName"] = "K8SNode" - metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + metricItem = parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime) + if !metricItem.nil? && !metricItem.empty? metricItems.push(metricItem) - #push node level metrics to a inmem hash so that we can use it looking up at container level. 
- #Currently if container level cpu & memory limits are not defined we default to node level limits - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") end end rescue => error From abc28c27142959a10832aacb138f73a85a9935db Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 29 Nov 2020 17:41:14 -0800 Subject: [PATCH 06/45] fix crash --- source/plugins/ruby/in_kube_nodes.rb | 257 ++++++++++---------- source/plugins/ruby/in_kube_podinventory.rb | 5 - 2 files changed, 126 insertions(+), 136 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index e9c00a642..89a2ff7ba 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -59,11 +59,6 @@ def start if !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].empty? @NODES_EMIT_STREAM_BATCH_SIZE = ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i - NodesChunkSize = @NODES_CHUNK_SIZE.to_i - if @NODES_EMIT_STREAM_BATCH_SIZE > NodesChunkSize - $log.info("in_kube_nodes::start : NODES_EMIT_STREAM_BATCH_SIZE cant be greater than nodes chunksize @ #{@NODES_CHUNK_SIZE}") - @NODES_EMIT_STREAM_BATCH_SIZE = NodesChunkSize - end end $log.info("in_kube_nodes::start : NODES_EMIT_STREAM_BATCH_SIZE @ #{@NODES_EMIT_STREAM_BATCH_SIZE}") @@ -163,147 +158,147 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) # node inventory nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) wrapper = { - "DataType" => "KUBE_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [nodeInventoryRecord.each { |k, v| nodeInventoryRecord[k] = v }], + "DataType" => "KUBE_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [nodeInventoryRecord.each { |k, v| nodeInventoryRecord[k] = v }], } eventStream.add(emitTime, wrapper) if wrapper if 
@NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - if @NODES_EMIT_STREAM - $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@tag, eventStream) if eventStream - end - if @MDM_KUBE_NODE_INVENTORY_EMIT_STREAM - $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream - end - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) - $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - eventStream = MultiEventStream.new + if @NODES_EMIT_STREAM + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream + end + if @MDM_KUBE_NODE_INVENTORY_EMIT_STREAM + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + end + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + eventStream = MultiEventStream.new end - # container node inventory - containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) - containerNodeInventoryWrapper = { - "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], - } - containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + # container node inventory + containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) + containerNodeInventoryWrapper = { + "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], + } + containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper - if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - if @CONTAINER_NODE_INVENTORY_EMIT_STREAM - $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream - end - containerNodeInventoryEventStream = MultiEventStream.new + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + if @CONTAINER_NODE_INVENTORY_EMIT_STREAM + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if 
containerNodeInventoryEventStream end + containerNodeInventoryEventStream = MultiEventStream.new + end - # node metrics records - nodeMetricRecords = [] - nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime) - if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? - nodeMetricRecords.push(nodeMetricRecord) - end - nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "memory", "memoryAllocatableBytes", batchTime) - if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? - nodeMetricRecords.push(nodeMetricRecord) - end - nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "cpu", "cpuCapacityNanoCores", batchTime) - if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? - nodeMetricRecords.push(nodeMetricRecord) - end - nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "memory", "memoryCapacityBytes", batchTime) - if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? - nodeMetricRecords.push(nodeMetricRecord) - end - nodeMetricRecords.each do |metricRecord| - metricRecord["DataType"] = "LINUX_PERF_BLOB" - metricRecord["IPName"] = "LogManagement" - kubePerfEventStream.add(emitTime, metricRecord) if metricRecord - end - if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - if @NODES_PERF_EMIT_STREAM - $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - end - kubePerfEventStream = MultiEventStream.new + # node metrics records + nodeMetricRecords = [] + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? 
+ nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "memory", "memoryAllocatableBytes", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "cpu", "cpuCapacityNanoCores", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "memory", "memoryCapacityBytes", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecords.each do |metricRecord| + metricRecord["DataType"] = "LINUX_PERF_BLOB" + metricRecord["IPName"] = "LogManagement" + kubePerfEventStream.add(emitTime, metricRecord) if metricRecord + end + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + if @NODES_PERF_EMIT_STREAM + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream end + kubePerfEventStream = MultiEventStream.new + end - # node GPU metrics record - nodeGPUInsightsMetricsRecords = [] - insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "allocatable", "nvidia.com/gpu", "nodeGpuAllocatable", batchTime) - if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? - nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) - end - insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "capacity", "nvidia.com/gpu", "nodeGpuCapacity", batchTime) - if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? 
- nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) - end - insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "allocatable", "amd.com/gpu", "nodeGpuAllocatable", batchTime) - if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? - nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) - end - insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "capacity", "amd.com/gpu", "nodeGpuCapacity", batchTime) - if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? - nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) - end - nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(emitTime, wrapper) if wrapper - end - if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - if @GPU_NODES_PERF_EMIT_STREAM - $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - end - insightsMetricsEventStream = MultiEventStream.new + # node GPU metrics record + nodeGPUInsightsMetricsRecords = [] + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "allocatable", "nvidia.com/gpu", "nodeGpuAllocatable", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "capacity", "nvidia.com/gpu", "nodeGpuCapacity", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? 
+ nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "allocatable", "amd.com/gpu", "nodeGpuAllocatable", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "capacity", "amd.com/gpu", "nodeGpuCapacity", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + end + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + if @GPU_NODES_PERF_EMIT_STREAM + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream end - # Adding telemetry to send node telemetry every 10 minutes - timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - properties = getNodeTelemetryProps(item) - properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] - capacityInfo = item["status"]["capacity"] - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) - begin - if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) 
- properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] - end - - if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) - properties["amdgpus"] = capacityInfo["amd.com/gpu"] - end - rescue => errorStr - $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + insightsMetricsEventStream = MultiEventStream.new + end + # Adding telemetry to send node telemetry every 10 minutes + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + properties = getNodeTelemetryProps(item) + properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] + capacityInfo = item["status"]["capacity"] + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + begin + if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) + properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] end - # Telemetry for data collection config for replicaset - if (File.file?(@@configMapMountPath)) - properties["collectAllKubeEvents"] = @@collectAllKubeEvents + if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) 
+ properties["amdgpus"] = capacityInfo["amd.com/gpu"] end + rescue => errorStr + $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength - end - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents + end + + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = @@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength end + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + telemetrySent = true + end end if telemetrySent == true @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i @@ -468,7 +463,7 @@ def getNodeTelemetryProps(item) rescue => errorStr $log.warn "in_kube_nodes::getContainerNodeIngetNodeTelemetryPropsventoryRecord:Failed: #{errorStr}" end - return properties + return properties end end # Kube_Node_Input end # module diff 
--git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 9a20be62d..13903cd4a 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -78,11 +78,6 @@ def start if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i - PodsChunkSize = @PODS_CHUNK_SIZE.to_i - if @PODS_EMIT_STREAM_BATCH_SIZE > PodsChunkSize - $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE shouldnt be greater than @ #{@PODS_CHUNK_SIZE} ") - @PODS_EMIT_STREAM_BATCH_SIZE = PodsChunkSize - end end $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") From 259a95c30759f034f1f61a1623913a3020367822 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 29 Nov 2020 18:45:12 -0800 Subject: [PATCH 07/45] fix bug with service name --- source/plugins/ruby/in_kube_podinventory.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 13903cd4a..00b721424 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -122,7 +122,7 @@ def enumerate(podList = nil) serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") serviceInfo = nil - # service inventory records much smaller size and fixed compared to serviceList + # service inventory records much smaller and fixed size compared to serviceList serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime) serviceList = nil end @@ -618,7 +618,11 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) serviceRecords.each do |kubeServiceRecord| found = 0 if 
kubeServiceRecord["Namespace"] == namespace - selectorLabels = kubeServiceRecord["SelectorLabels"] + selectorLabels = {} + # selector labels wrapped in array in kube service records so unwrapping here + if !kubeServiceRecord["SelectorLabels"].nil? && kubeServiceRecord["SelectorLabels"].length > 0 + selectorLabels = kubeServiceRecord["SelectorLabels"][0] + end if !selectorLabels.empty? selectorLabels.each do |key, value| if !(labels.select { |k, v| k == key && v == value }.length > 0) From b37529b382fba256ca62e49e243a892d2dc09e51 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 29 Nov 2020 20:34:32 -0800 Subject: [PATCH 08/45] fix bugs related to get service name --- source/plugins/ruby/in_kube_nodes.rb | 8 ++++++++ source/plugins/ruby/in_kube_podinventory.rb | 5 +++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 89a2ff7ba..c36042927 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -310,6 +310,14 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) end eventStream = nil end + if containerNodeInventoryEventStream.count > 0 + if @CONTAINER_NODE_INVENTORY_EMIT_STREAM + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + end + containerNodeInventoryEventStream = nil + end + if kubePerfEventStream.count > 0 if @NODES_PERF_EMIT_STREAM $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 00b721424..702b0f130 100644 --- 
a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -623,7 +623,7 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) if !kubeServiceRecord["SelectorLabels"].nil? && kubeServiceRecord["SelectorLabels"].length > 0 selectorLabels = kubeServiceRecord["SelectorLabels"][0] end - if !selectorLabels.empty? + if !selectorLabels.nil? && !selectorLabels.empty? selectorLabels.each do |key, value| if !(labels.select { |k, v| k == key && v == value }.length > 0) break @@ -631,7 +631,8 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) found = found + 1 end end - if found == selectorLabels.length + # service can have no selectors to avoid mapping to wrong service check found > 0 + if found > 0 && found == selectorLabels.length return kubeServiceRecord["ServiceName"] end end From 7375e3388a7db18502a7e248e390495d592ae062 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 29 Nov 2020 22:48:24 -0800 Subject: [PATCH 09/45] update oom fix test agent --- kubernetes/omsagent.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 49a9235de..fc95d49f3 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -340,7 +340,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020" imagePullPolicy: IfNotPresent resources: limits: @@ -499,7 +499,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020" imagePullPolicy: IfNotPresent resources: limits: From ed0857b77cb56d4658f1a7184129e00c154d8534 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 30 Nov 2020 
06:43:30 -0800 Subject: [PATCH 10/45] debug logs --- source/plugins/ruby/in_kube_events.rb | 8 ++++++-- source/plugins/ruby/in_kube_nodes.rb | 6 ++++++ source/plugins/ruby/in_kube_podinventory.rb | 12 ++++++++++-- source/plugins/ruby/in_kubestate_deployments.rb | 6 ++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index 561909246..4f532ff52 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -92,8 +92,10 @@ def enumerate end $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?) + # debug logs to track the payload size eventsCount = eventList["items"].length - $log.info "in_kube_events::enumerate:Received number of events is eventList is #{eventsCount} @ #{Time.now.utc.iso8601}" + eventsInventorySizeInKB = (eventList.to_s.length) / 1024 + $log.info "in_kube_events::enumerate:Received number of events in eventList is #{eventsCount} and size in KB #{eventsInventorySizeInKB} @ #{Time.now.utc.iso8601}" newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime) else $log.warn "in_kube_events::enumerate:Received empty eventList" @@ -103,8 +105,10 @@ def enumerate while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, eventList = KubernetesApiClient.getResourcesAndContinuationToken("events?fieldSelector=type!=Normal&limit=#{@EVENTS_CHUNK_SIZE}&continue=#{continuationToken}") if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?) 
+ # debug logs to track the payload size eventsCount = eventList["items"].length - $log.info "in_kube_events::enumerate:Received number of events is eventList is #{eventsCount} @ #{Time.now.utc.iso8601}" + eventsInventorySizeInKB = (eventList.to_s.length) / 1024 + $log.info "in_kube_events::enumerate:Received number of events in eventList is #{eventsCount} and size in KB #{eventsInventorySizeInKB} @ #{Time.now.utc.iso8601}" newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime) else $log.warn "in_kube_events::enumerate:Received empty eventList" diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index c36042927..f357da87e 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -119,6 +119,9 @@ def enumerate $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + # debug logs to track the payload size + nodeInventorySizeInKB = (nodeInventory.to_s.length) / 1024 + $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} and size in KB: #{nodeInventorySizeInKB} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" @@ -128,6 +131,9 @@ def enumerate while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
+ # debug logs to track the payload size + nodeInventorySizeInKB = (nodeInventory.to_s.length) / 1024 + $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} and size in KB: #{nodeInventorySizeInKB} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 702b0f130..a70f1aab6 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -118,9 +118,11 @@ def enumerate(podList = nil) $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") if !serviceInfo.nil? - $log.info("in_kube_podinventory::enumerate:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + # debug logs to track the payload size + serviceInfoResponseSizeInKB = (serviceInfo.body.length) / 1024 + $log.info("in_kube_podinventory::enumerate:Start:Parsing services data using yajl serviceInfo size in KB #{serviceInfoResponseSizeInKB} @ #{Time.now.utc.iso8601}") serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) - $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl serviceInfo size in KB #{serviceInfoResponseSizeInKB} @ #{Time.now.utc.iso8601}") serviceInfo = nil # service inventory records much smaller and fixed size compared to serviceList serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime) @@ -133,6 +135,9 @@ def enumerate(podList = nil) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ 
#{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) + # debug logs to track the payload size + podInventorySizeInKB = (podInventory.to_s.length) / 1024 + $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} and size in KB: #{podInventorySizeInKB} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" @@ -142,6 +147,9 @@ def enumerate(podList = nil) while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
+ # debug logs to track the payload size + podInventorySizeInKB = (podInventory.to_s.length) / 1024 + $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} and size in KB: #{podInventorySizeInKB} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index e1679626d..cd9f279e1 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -86,6 +86,9 @@ def enumerate continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}", api_group: @DEPLOYMENTS_API_GROUP) $log.info("in_kubestate_deployments::enumerate : Done getting deployments from Kube API @ #{Time.now.utc.iso8601}") if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) + # debug logs to track the payload size + deploymentsSizeInKB = (deploymentList.to_s.length) / 1024 + $log.info("in_kubestate_deployments::enumerate : number of deployment items :#{deploymentList["items"].length} and size in KB: #{deploymentsSizeInKB} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(deploymentList, batchTime) else $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" @@ -95,6 +98,9 @@ def enumerate while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @DEPLOYMENTS_API_GROUP) if (!deploymentList.nil? && !deploymentList.empty? 
&& deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) + # debug logs to track the payload size + deploymentsSizeInKB = (deploymentList.to_s.length) / 1024 + $log.info("in_kubestate_deployments::enumerate : number of deployment items :#{deploymentList["items"].length} and size in KB: #{deploymentsSizeInKB} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(deploymentList, batchTime) else $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" From b69f0320ef09e970aeb6645a8dafdacdd903c8e8 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 30 Nov 2020 08:05:24 -0800 Subject: [PATCH 11/45] fix service label issue --- source/plugins/ruby/in_kube_podinventory.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index a70f1aab6..7a25dbdc4 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -638,10 +638,10 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) end found = found + 1 end - end - # service can have no selectors to avoid mapping to wrong service check found > 0 - if found > 0 && found == selectorLabels.length - return kubeServiceRecord["ServiceName"] + # service can have no selectors + if found == selectorLabels.length + return kubeServiceRecord["ServiceName"] + end end end end From 2eeaed4e8d3c7472978a77bb98cf010c2e5f5cd9 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 30 Nov 2020 09:15:51 -0800 Subject: [PATCH 12/45] update to latest agent and enable ephemeral annotation --- kubernetes/omsagent.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index fc95d49f3..893432dad 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -340,7 +340,7 @@ spec: 
serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-2" imagePullPolicy: IfNotPresent resources: limits: @@ -499,7 +499,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-2" imagePullPolicy: IfNotPresent resources: limits: @@ -615,14 +615,14 @@ spec: affinity: nodeAffinity: # affinity to schedule on to ephemeral os node if its available - # preferredDuringSchedulingIgnoredDuringExecution: - # - weight: 1 - # preference: - # matchExpressions: - # - key: storageprofile - # operator: NotIn - # values: - # - managed + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: storageprofile + operator: NotIn + values: + - managed requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - labelSelector: From 10e4b71e9408908dd1b60b0148b1b4602383fa2a Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 30 Nov 2020 10:54:43 -0800 Subject: [PATCH 13/45] change stream size to 200 from 250 --- kubernetes/omsagent.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 893432dad..3310cda66 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -514,7 +514,7 @@ spec: - name: PODS_CHUNK_SIZE value: "500" - name: PODS_EMIT_STREAM_BATCH_SIZE - value: "250" + value: "200" - name: PODS_EMIT_STREAM value: "true" - name: MDM_PODS_INVENTORY_EMIT_STREAM From d003daa1d0eeb7439467be08cb25e5b042e95b7d Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 30 Nov 2020 15:29:08 -0800 Subject: [PATCH 14/45] update yaml --- kubernetes/omsagent.yaml | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 3310cda66..e03455fdb 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -544,7 +544,7 @@ spec: # event inventory plugin settings - name: EVENTS_CHUNK_SIZE - value: "30000" + value: "3000" - name: EVENTS_EMIT_STREAM value: "true" @@ -614,8 +614,8 @@ spec: periodSeconds: 60 affinity: nodeAffinity: - # affinity to schedule on to ephemeral os node if its available - preferredDuringSchedulingIgnoredDuringExecution: + # affinity to schedule on to ephemeral os node if its available + preferredDuringSchedulingIgnoredDuringExecution: - weight: 1 preference: matchExpressions: From 0ba06108eb445dcf8eec341b8fe196798774d89d Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 30 Nov 2020 19:09:07 -0800 Subject: [PATCH 15/45] adjust chunksizes --- source/plugins/ruby/in_kube_events.rb | 13 +-- source/plugins/ruby/in_kube_nodes.rb | 88 +++++-------------- source/plugins/ruby/in_kube_podinventory.rb | 80 +++++------------ .../plugins/ruby/in_kubestate_deployments.rb | 17 ++-- source/plugins/ruby/in_kubestate_hpa.rb | 11 +-- 5 files changed, 52 insertions(+), 157 deletions(-) diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index 4f532ff52..6cea5e996 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -17,9 +17,8 @@ def initialize require_relative "omslog" require_relative "ApplicationInsightsUtility" - # 30000 events account to approximately 5MB - @EVENTS_CHUNK_SIZE = 30000 - @EVENTS_EMIT_STREAM = true + # 4000 events (1KB per event) account to approximately 4MB + @EVENTS_CHUNK_SIZE = 4000 # Initializing events count for telemetry @eventsCount = 0 @@ -42,10 +41,6 @@ def start end $log.info("in_kube_events::start : EVENTS_CHUNK_SIZE @ #{@EVENTS_CHUNK_SIZE}") - if !ENV["EVENTS_EMIT_STREAM"].nil? && !ENV["EVENTS_EMIT_STREAM"].empty? 
- @EVENTS_EMIT_STREAM = ENV["EVENTS_EMIT_STREAM"].to_s.downcase == "true" ? true : false - end - $log.info("in_kube_events::start : EVENTS_EMIT_STREAM @ #{@EVENTS_EMIT_STREAM}") @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -174,9 +169,7 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim eventStream.add(emitTime, wrapper) if wrapper @eventsCount += 1 end - if @EVENTS_EMIT_STREAM - router.emit_stream(@tag, eventStream) if eventStream - end + router.emit_stream(@tag, eventStream) if eventStream rescue => errorStr $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index f357da87e..c77db2791 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -32,14 +32,10 @@ def initialize require_relative "ApplicationInsightsUtility" require_relative "oms_common" require_relative "omslog" - @NODES_CHUNK_SIZE = "400" + # 250 Node items (15KB per node) account to approximately 4MB + @NODES_CHUNK_SIZE = "250" # 0 indicates no batch enabled for stream emit @NODES_EMIT_STREAM_BATCH_SIZE = 0 - @NODES_EMIT_STREAM = true - @NODES_PERF_EMIT_STREAM = true - @GPU_NODES_PERF_EMIT_STREAM = true - @CONTAINER_NODE_INVENTORY_EMIT_STREAM = true - @MDM_KUBE_NODE_INVENTORY_EMIT_STREAM = true require_relative "constants" end @@ -62,31 +58,6 @@ def start end $log.info("in_kube_nodes::start : NODES_EMIT_STREAM_BATCH_SIZE @ #{@NODES_EMIT_STREAM_BATCH_SIZE}") - if !ENV["NODES_EMIT_STREAM"].nil? && !ENV["NODES_EMIT_STREAM"].empty? - @NODES_EMIT_STREAM = ENV["NODES_EMIT_STREAM"].to_s.downcase == "true" ? true : false - end - $log.info("in_kube_nodes::start : NODES_EMIT_STREAM @ #{@NODES_EMIT_STREAM}") - - if !ENV["CONTAINER_NODE_INVENTORY_EMIT_STREAM"].nil? && !ENV["CONTAINER_NODE_INVENTORY_EMIT_STREAM"].empty? 
- @CONTAINER_NODE_INVENTORY_EMIT_STREAM = ENV["CONTAINER_NODE_INVENTORY_EMIT_STREAM"].to_s.downcase == "true" ? true : false - end - $log.info("in_kube_nodes::start : CONTAINER_NODE_INVENTORY_EMIT_STREAM @ #{@CONTAINER_NODE_INVENTORY_EMIT_STREAM}") - - if !ENV["MDM_KUBE_NODE_INVENTORY_EMIT_STREAM"].nil? && !ENV["MDM_KUBE_NODE_INVENTORY_EMIT_STREAM"].empty? - @MDM_KUBE_NODE_INVENTORY_EMIT_STREAM = ENV["MDM_KUBE_NODE_INVENTORY_EMIT_STREAM"].to_s.downcase == "true" ? true : false - end - $log.info("in_kube_nodes::start : MDM_KUBE_NODE_INVENTORY_EMIT_STREAM @ #{@MDM_KUBE_NODE_INVENTORY_EMIT_STREAM}") - - if !ENV["NODES_PERF_EMIT_STREAM"].nil? && !ENV["NODES_PERF_EMIT_STREAM"].empty? - @NODES_PERF_EMIT_STREAM = ENV["NODES_PERF_EMIT_STREAM"].to_s.downcase == "true" ? true : false - end - $log.info("in_kube_nodes::start : NODES_PERF_EMIT_STREAM @ #{@NODES_PERF_EMIT_STREAM}") - - if !ENV["GPU_NODES_PERF_EMIT_STREAM"].nil? && !ENV["GPU_NODES_PERF_EMIT_STREAM"].empty? - @GPU_NODES_PERF_EMIT_STREAM = ENV["GPU_NODES_PERF_EMIT_STREAM"].to_s.downcase == "true" ? 
true : false - end - $log.info("in_kube_nodes::start : GPU_NODES_PERF_EMIT_STREAM @ #{@GPU_NODES_PERF_EMIT_STREAM}") - @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -170,14 +141,11 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) } eventStream.add(emitTime, wrapper) if wrapper if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - if @NODES_EMIT_STREAM - $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@tag, eventStream) if eventStream - end - if @MDM_KUBE_NODE_INVENTORY_EMIT_STREAM - $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream - end + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -194,10 +162,8 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - if @CONTAINER_NODE_INVENTORY_EMIT_STREAM - $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream - end + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = MultiEventStream.new end @@ -225,10 +191,8 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) kubePerfEventStream.add(emitTime, metricRecord) if metricRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - if @NODES_PERF_EMIT_STREAM - $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - end + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = MultiEventStream.new end @@ -259,10 +223,8 @@ def parse_and_emit_records(nodeInventory, batchTime 
= Time.utc.iso8601) insightsMetricsEventStream.add(emitTime, wrapper) if wrapper end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - if @GPU_NODES_PERF_EMIT_STREAM - $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - end + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = MultiEventStream.new end # Adding telemetry to send node telemetry every 10 minutes @@ -310,32 +272,24 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i end if eventStream.count > 0 - if @NODES_EMIT_STREAM - $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@tag, eventStream) if eventStream - end + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream eventStream = nil end if containerNodeInventoryEventStream.count > 0 - if @CONTAINER_NODE_INVENTORY_EMIT_STREAM - $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream - end + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory 
records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = nil end if kubePerfEventStream.count > 0 - if @NODES_PERF_EMIT_STREAM - $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - end + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = nil end if insightsMetricsEventStream.count > 0 - if @GPU_NODES_PERF_EMIT_STREAM - $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - end + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = nil end rescue => errorStr diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 7a25dbdc4..07655c22c 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -27,12 +27,8 @@ def initialize require_relative "omslog" require_relative "constants" - @PODS_EMIT_STREAM = true - @CONTAINER_PERF_EMIT_STREAM = true - @GPU_PERF_EMIT_STREAM = true - @SERVICES_EMIT_STREAM = true - - @PODS_CHUNK_SIZE = "1500" + # 
500 pod (10KB per pod) account to approximately 5MB + @PODS_CHUNK_SIZE = "500" @podCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 @@ -51,26 +47,6 @@ def configure(conf) def start if @run_interval - if !ENV["PODS_EMIT_STREAM"].nil? && !ENV["PODS_EMIT_STREAM"].empty? - @PODS_EMIT_STREAM = ENV["PODS_EMIT_STREAM"].to_s.downcase == "true" ? true : false - end - $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM @ #{@PODS_EMIT_STREAM}") - - if !ENV["SERVICES_EMIT_STREAM"].nil? && !ENV["SERVICES_EMIT_STREAM"].empty? - @SERVICES_EMIT_STREAM = ENV["SERVICES_EMIT_STREAM"].to_s.downcase == "true" ? true : false - end - $log.info("in_kube_podinventory::start : SERVICES_EMIT_STREAM @ #{@SERVICES_EMIT_STREAM}") - - if !ENV["CONTAINER_PERF_EMIT_STREAM"].nil? && !ENV["CONTAINER_PERF_EMIT_STREAM"].empty? - @CONTAINER_PERF_EMIT_STREAM = ENV["CONTAINER_PERF_EMIT_STREAM"].to_s.downcase == "true" ? true : false - end - $log.info("in_kube_podinventory::start : CONTAINER_PERF_EMIT_STREAM @ #{@CONTAINER_PERF_EMIT_STREAM}") - - if !ENV["GPU_PERF_EMIT_STREAM"].nil? && !ENV["GPU_PERF_EMIT_STREAM"].empty? - @GPU_PERF_EMIT_STREAM = ENV["GPU_PERF_EMIT_STREAM"].to_s.downcase == "true" ? true : false - end - $log.info("in_kube_podinventory::start : GPU_PERF_EMIT_STREAM @ #{@GPU_PERF_EMIT_STREAM}") - if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"] end @@ -241,13 +217,11 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - if @PODS_EMIT_STREAM - $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - router.emit_stream(@tag, eventStream) if eventStream + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + router.emit_stream(@tag, eventStream) if eventStream eventStream = MultiEventStream.new end @@ -265,10 +239,8 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - if @CONTAINER_PERF_EMIT_STREAM - $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - end + $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = MultiEventStream.new end @@ -288,22 +260,18 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - if @GPU_PERF_EMIT_STREAM - $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) - $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = MultiEventStream.new end end #podInventory block end if eventStream.count > 0 - if @PODS_EMIT_STREAM - $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@tag, eventStream) if eventStream - end + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -311,18 +279,14 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if kubePerfEventStream.count > 0 - if @CONTAINER_PERF_EMIT_STREAM - $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - end + $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = nil end if insightsMetricsEventStream.count > 0 - if @GPU_PERF_EMIT_STREAM - $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - end + $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -354,16 +318,14 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc } kubeServicesEventStream.add(emitTime, kubeServicewrapper) if kubeServicewrapper if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - if @SERVICES_EMIT_STREAM - $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream - end + $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream kubeServicesEventStream = MultiEventStream.new end end end - if @SERVICES_EMIT_STREAM && kubeServicesEventStream.count > 0 + if kubeServicesEventStream.count > 0 $log.info("in_kube_podinventory::parse_and_emit_records : number of service records emitted #{kubeServicesEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream end diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index cd9f279e1..ffbe8ae4f 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -22,9 +22,8 @@ def initialize require_relative "constants" # roughly each deployment is 8k - # 1000 deployments account to approximately 8MB - @DEPLOYMENTS_CHUNK_SIZE = 1000 - @DEPLOYMENTS_EMIT_STREAM = true + # 500 deployments account to approximately 4MB + @DEPLOYMENTS_CHUNK_SIZE = 500 @DEPLOYMENTS_API_GROUP = "apps" @@telemetryLastSentTime = DateTime.now.to_time.to_i @@ -49,11 +48,6 @@ def start end 
$log.info("in_kubestate_deployments::start : DEPLOYMENTS_CHUNK_SIZE @ #{@DEPLOYMENTS_CHUNK_SIZE}") - if !ENV["DEPLOYMENTS_EMIT_STREAM"].nil? && !ENV["DEPLOYMENTS_EMIT_STREAM"].empty? - @DEPLOYMENTS_EMIT_STREAM = ENV["DEPLOYMENTS_EMIT_STREAM"].to_s.downcase == "true" ? true : false - end - $log.info("in_kubestate_deployments::start : DEPLOYMENTS_EMIT_STREAM @ #{@DEPLOYMENTS_EMIT_STREAM}") - @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -196,10 +190,9 @@ def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) insightsMetricsEventStream.add(time, wrapper) if wrapper end - if @DEPLOYMENTS_EMIT_STREAM - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - $log.info("successfully emitted #{metricItems.length()} kube_state_deployment metrics") - end + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + $log.info("successfully emitted #{metricItems.length()} kube_state_deployment metrics") + @deploymentsRunningTotal = @deploymentsRunningTotal + metricItems.length() if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("kubestatedeploymentsInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_kubestate_hpa.rb b/source/plugins/ruby/in_kubestate_hpa.rb index ac7d1e853..736f17250 100644 --- a/source/plugins/ruby/in_kubestate_hpa.rb +++ b/source/plugins/ruby/in_kubestate_hpa.rb @@ -22,7 +22,6 @@ def initialize # 2000 HPAs account to approximately 6-7MB @HPA_CHUNK_SIZE = 2000 @HPA_API_GROUP = "autoscaling" - @HPA_EMIT_STREAM = true # telemetry @hpaCount = 0 @@ -46,10 +45,6 @@ def start end $log.info("in_kubestate_hpa::start : HPA_CHUNK_SIZE @ #{@HPA_CHUNK_SIZE}") - if !ENV["HPA_EMIT_STREAM"].nil? && !ENV["HPA_EMIT_STREAM"].empty? 
- @HPA_EMIT_STREAM = ENV["HPA_EMIT_STREAM"].to_s.downcase == "true" ? true : false - end - $log.info("in_kubestate_hpa::start : HPA_EMIT_STREAM @ #{@HPA_EMIT_STREAM}") @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -191,10 +186,8 @@ def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) insightsMetricsEventStream.add(time, wrapper) if wrapper end - if @HPA_EMIT_STREAM - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - $log.info("successfully emitted #{metricItems.length()} kube_state_hpa metrics") - end + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + $log.info("successfully emitted #{metricItems.length()} kube_state_hpa metrics") if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("kubestatehpaInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end From 43975d9689b959af758a14381de3e895641351d2 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 30 Nov 2020 20:26:40 -0800 Subject: [PATCH 16/45] add ruby gc env --- kubernetes/linux/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index d04e86128..34ab133da 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -15,6 +15,7 @@ ENV HOST_VAR /hostfs/var ENV AZMON_COLLECT_ENV False ENV KUBE_CLIENT_BACKOFF_BASE 1 ENV KUBE_CLIENT_BACKOFF_DURATION 0 +ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs mdsd.xml envmdsd $tmpdir/ WORKDIR ${tmpdir} From 
2b8660ba2552d2968e1aba2b86dafb5b9555d817 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 30 Nov 2020 22:17:37 -0800 Subject: [PATCH 17/45] yaml changes for cioomtest11282020-3 --- kubernetes/omsagent.yaml | 62 ++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index e03455fdb..21df718bb 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -340,7 +340,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-2" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-3" imagePullPolicy: IfNotPresent resources: limits: @@ -499,7 +499,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-2" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-3" imagePullPolicy: IfNotPresent resources: limits: @@ -515,50 +515,50 @@ spec: value: "500" - name: PODS_EMIT_STREAM_BATCH_SIZE value: "200" - - name: PODS_EMIT_STREAM - value: "true" - - name: MDM_PODS_INVENTORY_EMIT_STREAM - value: "true" - - name: CONTAINER_PERF_EMIT_STREAM - value: "true" - - name: SERVICES_EMIT_STREAM - value: "true" - - name: GPU_PERF_EMIT_STREAM - value: "true" + # - name: PODS_EMIT_STREAM + # value: "true" + # - name: MDM_PODS_INVENTORY_EMIT_STREAM + # value: "true" + # - name: CONTAINER_PERF_EMIT_STREAM + # value: "true" + # - name: SERVICES_EMIT_STREAM + # value: "true" + # - name: GPU_PERF_EMIT_STREAM + # value: "true" # node inventory plugin settings - name: NODES_CHUNK_SIZE - value: "200" + value: "250" - name: NODES_EMIT_STREAM_BATCH_SIZE value: "100" - - name: NODES_EMIT_STREAM - value: "true" - - name: NODES_PERF_EMIT_STREAM - value: "true" - - name: GPU_NODES_PERF_EMIT_STREAM - value: "true" - - name: 
CONTAINER_NODE_INVENTORY_EMIT_STREAM - value: "true" - - name: MDM_KUBE_NODE_INVENTORY_EMIT_STREAM - value: "true" + # - name: NODES_EMIT_STREAM + # value: "true" + # - name: NODES_PERF_EMIT_STREAM + # value: "true" + # - name: GPU_NODES_PERF_EMIT_STREAM + # value: "true" + # - name: CONTAINER_NODE_INVENTORY_EMIT_STREAM + # value: "true" + # - name: MDM_KUBE_NODE_INVENTORY_EMIT_STREAM + # value: "true" # event inventory plugin settings - name: EVENTS_CHUNK_SIZE - value: "3000" - - name: EVENTS_EMIT_STREAM - value: "true" + value: "4000" + # - name: EVENTS_EMIT_STREAM + # value: "true" # kube state deployments - name: DEPLOYMENTS_CHUNK_SIZE - value: "1000" - - name: DEPLOYMENTS_EMIT_STREAM - value: "true" + value: "500" + # - name: DEPLOYMENTS_EMIT_STREAM + # value: "true" # kube hpa - name: HPA_CHUNK_SIZE value: "2000" - - name: HPA_EMIT_STREAM - value: "true" + # - name: HPA_EMIT_STREAM + # value: "true" - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" From 8e378faf319bee9fb89592e026a8c784133ec17e Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 1 Dec 2020 12:52:23 -0800 Subject: [PATCH 18/45] telemetry to track pods latency --- source/plugins/ruby/in_kube_podinventory.rb | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 07655c22c..af3506f0d 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -35,6 +35,8 @@ def initialize @controllerData = {} # 0 indicates no batch enabled for stream emit @PODS_EMIT_STREAM_BATCH_SIZE = 0 + @podInventoryE2EProcessingLatencyInMillis = 0 + @podsAPIE2ELatencyInMillis = 0 end config_param :run_interval, :time, :default => 60 @@ -86,6 +88,7 @@ def enumerate(podList = nil) currentTime = Time.now batchTime = currentTime.utc.iso8601 serviceRecords = [] + @podInventoryE2EProcessingLatencyInMillis = 0 # Get services first so that we dont need to 
make a call for very chunk $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") @@ -105,11 +108,17 @@ def enumerate(podList = nil) serviceList = nil end + # to track e2e processing latency + @podsAPIE2ELatencyInMillis = 0 + startTime = (Time.now.to_f * 1000).to_i + podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i + @podsAPIE2ELatencyInMillis = (podsAPIChunkEndTime - podsAPIStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) # debug logs to track the payload size podInventorySizeInKB = (podInventory.to_s.length) / 1024 @@ -121,7 +130,10 @@ def enumerate(podList = nil) #If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) + podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") + podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i + @podsAPIE2ELatencyInMillis = @podsAPIE2ELatencyInMillis + (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
# debug logs to track the payload size podInventorySizeInKB = (podInventory.to_s.length) / 1024 @@ -132,6 +144,8 @@ def enumerate(podList = nil) end end + endTime = (Time.now.to_f * 1000).to_i + @podInventoryE2EProcessingLatencyInMillis = endTime - startTime # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil @@ -147,6 +161,8 @@ def enumerate(podList = nil) if telemetryFlush == true telemetryProperties = {} telemetryProperties["Computer"] = @@hostName + telemetryProperties["PODS_CHUNK_SIZE"] = @PODS_CHUNK_SIZE + telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) telemetryProperties["ControllerData"] = @controllerData.to_json @@ -155,6 +171,8 @@ def enumerate(podList = nil) telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) end + ApplicationInsightsUtility.sendMetricTelemetry("PodInventoryE2EProcessingLatencyInMillis", @podInventoryE2EProcessingLatencyInMillis, telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("PodsAPIE2ELatencyInMillis", @podsAPIE2ELatencyInMillis, telemetryProperties) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end rescue => errorStr From fb56ab069d2d7285bc23e1c7f523434981d9ea17 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 1 Dec 2020 13:02:57 -0800 Subject: [PATCH 19/45] service count telemetry --- source/plugins/ruby/in_kube_podinventory.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index af3506f0d..765238c16 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -30,6 
+30,7 @@ def initialize # 500 pod (10KB per pod) account to approximately 5MB @PODS_CHUNK_SIZE = "500" @podCount = 0 + @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 @controllerData = {} @@ -82,6 +83,7 @@ def enumerate(podList = nil) podInventory = podList telemetryFlush = false @podCount = 0 + @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 @controllerData = {} @@ -105,6 +107,8 @@ def enumerate(podList = nil) serviceInfo = nil # service inventory records much smaller and fixed size compared to serviceList serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime) + # updating for telemetry + @serviceCount += serviceRecords.length serviceList = nil end @@ -165,6 +169,7 @@ def enumerate(podList = nil) telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) + ApplicationInsightsUtility.sendMetricTelemetry("ServiceCount", @serviceCount, {}) telemetryProperties["ControllerData"] = @controllerData.to_json ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", @controllerSet.length, telemetryProperties) if @winContainerCount > 0 From e9541eafb88de8023afccba30548733abbc2de4c Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 1 Dec 2020 13:17:23 -0800 Subject: [PATCH 20/45] rename variables --- source/plugins/ruby/in_kube_podinventory.rb | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 765238c16..d94501bba 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -36,8 +36,8 @@ def initialize @controllerData = {} # 0 indicates no batch enabled for stream emit @PODS_EMIT_STREAM_BATCH_SIZE = 0 - 
@podInventoryE2EProcessingLatencyInMillis = 0 - @podsAPIE2ELatencyInMillis = 0 + @podInventoryE2EProcessingLatencyMs = 0 + @podsAPIE2ELatencyInMs = 0 end config_param :run_interval, :time, :default => 60 @@ -90,7 +90,7 @@ def enumerate(podList = nil) currentTime = Time.now batchTime = currentTime.utc.iso8601 serviceRecords = [] - @podInventoryE2EProcessingLatencyInMillis = 0 + @podInventoryE2EProcessingLatencyMs = 0 # Get services first so that we dont need to make a call for very chunk $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") @@ -113,8 +113,8 @@ def enumerate(podList = nil) end # to track e2e processing latency - @podsAPIE2ELatencyInMillis = 0 - startTime = (Time.now.to_f * 1000).to_i + @podsAPIE2ELatencyInMs = 0 + podInventoryStartTime = (Time.now.to_f * 1000).to_i podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil continuationToken = nil @@ -122,7 +122,7 @@ def enumerate(podList = nil) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i - @podsAPIE2ELatencyInMillis = (podsAPIChunkEndTime - podsAPIStartTime) + @podsAPIE2ELatencyInMs = (podsAPIChunkEndTime - podsAPIStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
# debug logs to track the payload size podInventorySizeInKB = (podInventory.to_s.length) / 1024 @@ -137,7 +137,7 @@ def enumerate(podList = nil) podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i - @podsAPIE2ELatencyInMillis = @podsAPIE2ELatencyInMillis + (podsAPIChunkEndTime - podsAPIChunkStartTime) + @podsAPIE2ELatencyInMs = @podsAPIE2ELatencyInMs + (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) # debug logs to track the payload size podInventorySizeInKB = (podInventory.to_s.length) / 1024 @@ -148,8 +148,7 @@ def enumerate(podList = nil) end end - endTime = (Time.now.to_f * 1000).to_i - @podInventoryE2EProcessingLatencyInMillis = endTime - startTime + @podInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - podInventoryStartTime) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil serviceRecords = nil @@ -176,8 +175,8 @@ def enumerate(podList = nil) telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) end - ApplicationInsightsUtility.sendMetricTelemetry("PodInventoryE2EProcessingLatencyInMillis", @podInventoryE2EProcessingLatencyInMillis, telemetryProperties) - ApplicationInsightsUtility.sendMetricTelemetry("PodsAPIE2ELatencyInMillis", @podsAPIE2ELatencyInMillis, telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("PodInventoryE2EProcessingLatencyMs", @podInventoryE2EProcessingLatencyMs, telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("PodsAPIE2ELatencyInMs", @podsAPIE2ELatencyInMs, telemetryProperties) 
@@podTelemetryTimeTracker = DateTime.now.to_time.to_i end rescue => errorStr From 023a7cbb7c29f729f8add2c172223690b0cac4c7 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 1 Dec 2020 13:30:31 -0800 Subject: [PATCH 21/45] wip --- source/plugins/ruby/in_kube_podinventory.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index d94501bba..2e920e7c4 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -37,7 +37,7 @@ def initialize # 0 indicates no batch enabled for stream emit @PODS_EMIT_STREAM_BATCH_SIZE = 0 @podInventoryE2EProcessingLatencyMs = 0 - @podsAPIE2ELatencyInMs = 0 + @podsAPIE2ELatencyMs = 0 end config_param :run_interval, :time, :default => 60 @@ -113,7 +113,7 @@ def enumerate(podList = nil) end # to track e2e processing latency - @podsAPIE2ELatencyInMs = 0 + @podsAPIE2ELatencyMs = 0 podInventoryStartTime = (Time.now.to_f * 1000).to_i podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil @@ -122,7 +122,7 @@ def enumerate(podList = nil) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i - @podsAPIE2ELatencyInMs = (podsAPIChunkEndTime - podsAPIStartTime) + @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
# debug logs to track the payload size podInventorySizeInKB = (podInventory.to_s.length) / 1024 @@ -137,7 +137,7 @@ def enumerate(podList = nil) podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i - @podsAPIE2ELatencyInMs = @podsAPIE2ELatencyInMs + (podsAPIChunkEndTime - podsAPIChunkStartTime) + @podsAPIE2ELatencyMs = @podsAPIE2ELatencyMs + (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) # debug logs to track the payload size podInventorySizeInKB = (podInventory.to_s.length) / 1024 @@ -176,7 +176,7 @@ def enumerate(podList = nil) ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) end ApplicationInsightsUtility.sendMetricTelemetry("PodInventoryE2EProcessingLatencyMs", @podInventoryE2EProcessingLatencyMs, telemetryProperties) - ApplicationInsightsUtility.sendMetricTelemetry("PodsAPIE2ELatencyInMs", @podsAPIE2ELatencyInMs, telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("PodsAPIE2ELatencyMs", @podsAPIE2ELatencyMs, telemetryProperties) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end rescue => errorStr From 26f07723843937f6b47a2a7e3ebae54158726b9a Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 1 Dec 2020 14:50:08 -0800 Subject: [PATCH 22/45] nodes inventory telemetry --- source/plugins/ruby/in_kube_nodes.rb | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index c77db2791..5caefb23e 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -36,6 +36,8 @@ def initialize @NODES_CHUNK_SIZE = "250" # 
0 indicates no batch enabled for stream emit @NODES_EMIT_STREAM_BATCH_SIZE = 0 + @nodeInventoryE2EProcessingLatencyMs = 0 + @nodesAPIE2ELatencyMs = 0 require_relative "constants" end @@ -82,13 +84,18 @@ def enumerate currentTime = Time.now batchTime = currentTime.utc.iso8601 + @nodesAPIE2ELatencyMs = 0 + @nodeInventoryE2EProcessingLatencyMs = 0 + nodeInventoryStartTime = (Time.now.to_f * 1000).to_i + nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) - $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i + @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) # debug logs to track the payload size nodeInventorySizeInKB = (nodeInventory.to_s.length) / 1024 @@ -100,7 +107,10 @@ def enumerate #If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) + nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i + @nodesAPIE2ELatencyMs = @nodesAPIE2ELatencyMs + (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
# debug logs to track the payload size nodeInventorySizeInKB = (nodeInventory.to_s.length) / 1024 @@ -111,6 +121,7 @@ def enumerate end end + @nodeInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - nodeInventoryStartTime) # Setting this to nil so that we dont hold memory until GC kicks in nodeInventory = nil rescue => errorStr @@ -234,7 +245,10 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) properties = getNodeTelemetryProps(item) properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] capacityInfo = item["status"]["capacity"] + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + ApplicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, properties) + ApplicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, properties) begin if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) 
properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] @@ -428,6 +442,8 @@ def getNodeTelemetryProps(item) # using containerRuntimeVersion as DockerVersion as is for non docker runtimes properties["DockerVersion"] = containerRuntimeVersion end + telemetryProperties["NODES_CHUNK_SIZE"] = @NODES_CHUNK_SIZE + telemetryProperties["NODES_EMIT_STREAM_BATCH_SIZE"] = @NODES_EMIT_STREAM_BATCH_SIZE rescue => errorStr $log.warn "in_kube_nodes::getContainerNodeIngetNodeTelemetryPropsventoryRecord:Failed: #{errorStr}" end From 79f40f1bea726de0f5e337d98626da817c2e1d1f Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 1 Dec 2020 17:06:29 -0800 Subject: [PATCH 23/45] configmap changes --- .../scripts/tomlparser-health-config.rb | 64 +++++++++++++++++-- 1 file changed, 59 insertions(+), 5 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-health-config.rb b/build/linux/installer/scripts/tomlparser-health-config.rb index 14c8bdb44..6dab36b7a 100644 --- a/build/linux/installer/scripts/tomlparser-health-config.rb +++ b/build/linux/installer/scripts/tomlparser-health-config.rb @@ -13,22 +13,31 @@ @configMapMountPath = "/etc/config/settings/agent-settings" @configSchemaVersion = "" @enable_health_model = false +@nodesChunkSize = 0 +@podsChunkSize = 0 +@eventsChunkSize = 0 +@deploymentsChunkSize = 0 +@hpaChunkSize = 0 + +def is_number?(value) + true if Integer(value) rescue false +end # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap begin # Check to see if config map is created if (File.file?(@configMapMountPath)) - puts "config::configmap container-azm-ms-agentconfig for agent health settings mounted, parsing values" + puts "config::configmap container-azm-ms-agentconfig for agent settings mounted, parsing values" parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) puts "config::Successfully parsed mounted config map" return parsedConfig else - puts "config::configmap container-azm-ms-agentconfig for 
agent health settings not mounted, using defaults" + puts "config::configmap container-azm-ms-agentconfig for agent settings not mounted, using defaults" return nil end rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config map for enabling health: #{errorStr}, using defaults, please check config map for errors") + ConfigParseErrorLogger.logError("Exception while parsing config map for agent settings : #{errorStr}, using defaults, please check config map for errors") return nil end end @@ -36,9 +45,39 @@ def parseConfigMap # Use the ruby structure created after config parsing to set the right values to be used as environment variables def populateSettingValuesFromConfigMap(parsedConfig) begin - if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? && !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? + if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? + if !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] puts "enable_health_model = #{@enable_health_model}" + end + chunk_config = parsedConfig[:agent_settings][:chunk_config] + if !chunk_config.nil? + nodesChunkSize = chunk_config[:NODES_CHUNK_SIZE] + if !nodesChunkSize.nil? && is_number?(nodesChunkSize) + @nodesChunkSize = nodesChunkSize.to_i + puts "NODES_CHUNK_SIZE = #{@nodesChunkSize}" + end + podsChunkSize = chunk_config[:PODS_CHUNK_SIZE] + if !podsChunkSize.nil? && is_number?(podsChunkSize) + @podsChunkSize = podsChunkSize.to_i + puts "PODS_CHUNK_SIZE = #{@podsChunkSize}" + end + eventsChunkSize = chunk_config[:EVENTS_CHUNK_SIZE] + if !eventsChunkSize.nil? 
&& is_number?(eventsChunkSize) + @eventsChunkSize = eventsChunkSize.to_i + puts "EVENTS_CHUNK_SIZE = #{@eventsChunkSize}" + end + deploymentsChunkSize = chunk_config[:DEPLOYMENTS_CHUNK_SIZE] + if !deploymentsChunkSize.nil? && is_number?(deploymentsChunkSize) + @deploymentsChunkSize = deploymentsChunkSize.to_i + puts "DEPLOYMENTS_CHUNK_SIZE = #{@deploymentsChunkSize}" + end + hpaChunkSize = chunk_config[:HPA_CHUNK_SIZE] + if !hpaChunkSize.nil? && is_number?(hpaChunkSize) + @hpaChunkSize = hpaChunkSize.to_i + puts "HPA_CHUNK_SIZE = #{@hpaChunkSize}" + end + end end rescue => errorStr puts "config::error:Exception while reading config settings for health_model enabled setting - #{errorStr}, using defaults" @@ -65,9 +104,24 @@ def populateSettingValuesFromConfigMap(parsedConfig) if !file.nil? file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") + if @nodesChunkSize > 0 + file.write("export NODES_CHUNK_SIZE=#{@nodesChunkSize}\n") + end + if @podsChunkSize > 0 + file.write("export PODS_CHUNK_SIZE=#{@podsChunkSize}\n") + end + if @eventsChunkSize > 0 + file.write("export EVENTS_CHUNK_SIZE=#{@eventsChunkSize}\n") + end + if @deploymentsChunkSize > 0 + file.write("export DEPLOYMENTS_CHUNK_SIZE=#{@deploymentsChunkSize}\n") + end + if @hpaChunkSize > 0 + file.write("export HPA_CHUNK_SIZE=#{@hpaChunkSize}\n") + end # Close file after writing all environment variables file.close else puts "Exception while opening file for writing config environment variables" puts "****************End Config Processing********************" -end \ No newline at end of file +end From 3545773035752e6b39c04a2095f56f3b833c2248 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 1 Dec 2020 19:12:36 -0800 Subject: [PATCH 24/45] add emit streams in configmap --- .../scripts/tomlparser-health-config.rb | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/build/linux/installer/scripts/tomlparser-health-config.rb 
b/build/linux/installer/scripts/tomlparser-health-config.rb index 6dab36b7a..68496d718 100644 --- a/build/linux/installer/scripts/tomlparser-health-config.rb +++ b/build/linux/installer/scripts/tomlparser-health-config.rb @@ -18,6 +18,8 @@ @eventsChunkSize = 0 @deploymentsChunkSize = 0 @hpaChunkSize = 0 +@podsEmitStreamBatchSize = 0 +@nodesEmitStreamBatchSize = 0 def is_number?(value) true if Integer(value) rescue false @@ -77,6 +79,16 @@ def populateSettingValuesFromConfigMap(parsedConfig) @hpaChunkSize = hpaChunkSize.to_i puts "HPA_CHUNK_SIZE = #{@hpaChunkSize}" end + podsEmitStreamBatchSize = chunk_config[:PODS_EMIT_STREAM_BATCH_SIZE] + if !podsEmitStreamBatchSize.nil? && is_number?(podsEmitStreamBatchSize) + @podsEmitStreamBatchSize = podsEmitStreamBatchSize.to_i + puts "PODS_EMIT_STREAM_BATCH_SIZE = #{@podsEmitStreamBatchSize}" + end + nodesEmitStreamBatchSize = chunk_config[:NODES_EMIT_STREAM_BATCH_SIZE] + if !nodesEmitStreamBatchSize.nil? && is_number?(nodesEmitStreamBatchSize) + @nodesEmitStreamBatchSize = nodesEmitStreamBatchSize.to_i + puts "NODES_EMIT_STREAM_BATCH_SIZE = #{@nodesEmitStreamBatchSize}" + end end end rescue => errorStr @@ -119,6 +131,12 @@ def populateSettingValuesFromConfigMap(parsedConfig) if @hpaChunkSize > 0 file.write("export HPA_CHUNK_SIZE=#{@hpaChunkSize}\n") end + if @podsEmitStreamBatchSize > 0 + file.write("export PODS_EMIT_STREAM_BATCH_SIZE=#{@podsEmitStreamBatchSize}\n") + end + if @nodesEmitStreamBatchSize > 0 + file.write("export NODES_EMIT_STREAM_BATCH_SIZE=#{@nodesEmitStreamBatchSize}\n") + end # Close file after writing all environment variables file.close else From 9b7587dbcb6d6b04494638a0eca8e032979c6c37 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 1 Dec 2020 21:40:04 -0800 Subject: [PATCH 25/45] yaml updates --- kubernetes/omsagent.yaml | 32 ++++++++++----------- source/plugins/ruby/in_kube_nodes.rb | 2 +- source/plugins/ruby/in_kube_podinventory.rb | 2 +- 3 files changed, 18 insertions(+), 18 
deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 21df718bb..6dc8c2f5c 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -340,7 +340,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-3" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-4" imagePullPolicy: IfNotPresent resources: limits: @@ -499,7 +499,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-3" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-4" imagePullPolicy: IfNotPresent resources: limits: @@ -511,10 +511,10 @@ spec: env: # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these # pod inventory plugin settings - - name: PODS_CHUNK_SIZE - value: "500" - - name: PODS_EMIT_STREAM_BATCH_SIZE - value: "200" + # - name: PODS_CHUNK_SIZE + # value: "500" + # - name: PODS_EMIT_STREAM_BATCH_SIZE + # value: "200" # - name: PODS_EMIT_STREAM # value: "true" # - name: MDM_PODS_INVENTORY_EMIT_STREAM @@ -527,10 +527,10 @@ spec: # value: "true" # node inventory plugin settings - - name: NODES_CHUNK_SIZE - value: "250" - - name: NODES_EMIT_STREAM_BATCH_SIZE - value: "100" + # - name: NODES_CHUNK_SIZE + # value: "250" + # - name: NODES_EMIT_STREAM_BATCH_SIZE + # value: "100" # - name: NODES_EMIT_STREAM # value: "true" # - name: NODES_PERF_EMIT_STREAM @@ -543,20 +543,20 @@ spec: # value: "true" # event inventory plugin settings - - name: EVENTS_CHUNK_SIZE - value: "4000" + # - name: EVENTS_CHUNK_SIZE + # value: "4000" # - name: EVENTS_EMIT_STREAM # value: "true" # kube state deployments - - name: DEPLOYMENTS_CHUNK_SIZE - value: "500" + # - name: DEPLOYMENTS_CHUNK_SIZE + # value: "500" # - name: DEPLOYMENTS_EMIT_STREAM # value: "true" # kube hpa - - name: 
HPA_CHUNK_SIZE - value: "2000" + # - name: HPA_CHUNK_SIZE + # value: "2000" # - name: HPA_EMIT_STREAM # value: "true" diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 5caefb23e..d1fe43db7 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -35,7 +35,7 @@ def initialize # 250 Node items (15KB per node) account to approximately 4MB @NODES_CHUNK_SIZE = "250" # 0 indicates no batch enabled for stream emit - @NODES_EMIT_STREAM_BATCH_SIZE = 0 + @NODES_EMIT_STREAM_BATCH_SIZE = 100 @nodeInventoryE2EProcessingLatencyMs = 0 @nodesAPIE2ELatencyMs = 0 require_relative "constants" diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 2e920e7c4..fa39d0d9c 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -35,7 +35,7 @@ def initialize @winContainerCount = 0 @controllerData = {} # 0 indicates no batch enabled for stream emit - @PODS_EMIT_STREAM_BATCH_SIZE = 0 + @PODS_EMIT_STREAM_BATCH_SIZE = 200 @podInventoryE2EProcessingLatencyMs = 0 @podsAPIE2ELatencyMs = 0 end From 9b857b4253027b91ab239fee7d9a8f2be0f6285b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 2 Dec 2020 07:30:20 -0800 Subject: [PATCH 26/45] fix copy and paste bug --- source/plugins/ruby/in_kube_nodes.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index d1fe43db7..d9eb6b09a 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -442,8 +442,8 @@ def getNodeTelemetryProps(item) # using containerRuntimeVersion as DockerVersion as is for non docker runtimes properties["DockerVersion"] = containerRuntimeVersion end - telemetryProperties["NODES_CHUNK_SIZE"] = @NODES_CHUNK_SIZE - telemetryProperties["NODES_EMIT_STREAM_BATCH_SIZE"] = @NODES_EMIT_STREAM_BATCH_SIZE + 
properties["NODES_CHUNK_SIZE"] = @NODES_CHUNK_SIZE + properties["NODES_EMIT_STREAM_BATCH_SIZE"] = @NODES_EMIT_STREAM_BATCH_SIZE rescue => errorStr $log.warn "in_kube_nodes::getContainerNodeIngetNodeTelemetryPropsventoryRecord:Failed: #{errorStr}" end From 5597360a0c78412a95fcf311f96262cfdff7c7b4 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 2 Dec 2020 13:39:40 -0800 Subject: [PATCH 27/45] add todo comments --- kubernetes/omsagent.yaml | 51 --------------------- source/plugins/ruby/in_kube_nodes.rb | 3 ++ source/plugins/ruby/in_kube_podinventory.rb | 2 + 3 files changed, 5 insertions(+), 51 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 6dc8c2f5c..bc001d9f4 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -509,57 +509,6 @@ spec: cpu: 150m memory: 250Mi env: - # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - # pod inventory plugin settings - # - name: PODS_CHUNK_SIZE - # value: "500" - # - name: PODS_EMIT_STREAM_BATCH_SIZE - # value: "200" - # - name: PODS_EMIT_STREAM - # value: "true" - # - name: MDM_PODS_INVENTORY_EMIT_STREAM - # value: "true" - # - name: CONTAINER_PERF_EMIT_STREAM - # value: "true" - # - name: SERVICES_EMIT_STREAM - # value: "true" - # - name: GPU_PERF_EMIT_STREAM - # value: "true" - - # node inventory plugin settings - # - name: NODES_CHUNK_SIZE - # value: "250" - # - name: NODES_EMIT_STREAM_BATCH_SIZE - # value: "100" - # - name: NODES_EMIT_STREAM - # value: "true" - # - name: NODES_PERF_EMIT_STREAM - # value: "true" - # - name: GPU_NODES_PERF_EMIT_STREAM - # value: "true" - # - name: CONTAINER_NODE_INVENTORY_EMIT_STREAM - # value: "true" - # - name: MDM_KUBE_NODE_INVENTORY_EMIT_STREAM - # value: "true" - - # event inventory plugin settings - # - name: EVENTS_CHUNK_SIZE - # value: "4000" - # - name: EVENTS_EMIT_STREAM - # value: "true" - - # kube state deployments - # - name: DEPLOYMENTS_CHUNK_SIZE - # value: "500" - # - name: 
DEPLOYMENTS_EMIT_STREAM - # value: "true" - - # kube hpa - # - name: HPA_CHUNK_SIZE - # value: "2000" - # - name: HPA_EMIT_STREAM - # value: "true" - - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index d9eb6b09a..5886b523c 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -346,6 +346,7 @@ def run_periodic @mutex.unlock end + # TODO - move this method to KubernetesClient or helper class def getNodeInventoryRecord(item, batchTime = Time.utc.iso8601) record = {} begin @@ -406,6 +407,7 @@ def getNodeInventoryRecord(item, batchTime = Time.utc.iso8601) return record end + # TODO - move this method to KubernetesClient or helper class def getContainerNodeInventoryRecord(item, batchTime = Time.utc.iso8601) containerNodeInventoryRecord = {} begin @@ -426,6 +428,7 @@ def getContainerNodeInventoryRecord(item, batchTime = Time.utc.iso8601) return containerNodeInventoryRecord end + # TODO - move this method to KubernetesClient or helper class def getNodeTelemetryProps(item) properties = {} begin diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index fa39d0d9c..215811353 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -395,6 +395,7 @@ def run_periodic @mutex.unlock end + # TODO - move this method to KubernetesClient or helper class def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) records = [] record = {} @@ -603,6 +604,7 @@ def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) return records end + # TODO - move this method to KubernetesClient or helper class def getServiceNameFromLabels(namespace, labels, serviceRecords) serviceName = "" begin From 8880e91fcd85df14850e6e41d901a909dac80277 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: 
Thu, 3 Dec 2020 22:02:52 -0800 Subject: [PATCH 28/45] fix node latency telemetry bug --- source/plugins/ruby/in_kube_nodes.rb | 12 +++++++++--- source/plugins/ruby/in_kube_podinventory.rb | 3 +-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 5886b523c..603c71e97 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -64,7 +64,8 @@ def start @condition = ConditionVariable.new @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) - @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i + @@nodeTelemetryTimeTracker = DateTime.now.to_time. + @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -122,6 +123,13 @@ def enumerate end @nodeInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - nodeInventoryStartTime) + timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + ApplicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) + ApplicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) + @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i + end # Setting this to nil so that we dont hold memory until GC kicks in nodeInventory = nil rescue => errorStr @@ -247,8 +255,6 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) capacityInfo = item["status"]["capacity"] ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) - ApplicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, properties) - ApplicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", 
@nodesAPIE2ELatencyMs, properties) begin if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 215811353..7d935b04e 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -91,7 +91,7 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] @podInventoryE2EProcessingLatencyMs = 0 - + podInventoryStartTime = (Time.now.to_f * 1000).to_i # Get services first so that we dont need to make a call for very chunk $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") @@ -114,7 +114,6 @@ def enumerate(podList = nil) # to track e2e processing latency @podsAPIE2ELatencyMs = 0 - podInventoryStartTime = (Time.now.to_f * 1000).to_i podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil continuationToken = nil From 87f52d6839f09e7a280c80d47495c27fb06a317e Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 4 Dec 2020 05:59:13 -0800 Subject: [PATCH 29/45] update yaml with latest test image --- kubernetes/omsagent.yaml | 4 ++-- source/plugins/ruby/in_kube_nodes.rb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index bc001d9f4..290490b19 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -340,7 +340,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-4" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-7" imagePullPolicy: IfNotPresent resources: limits: @@ -499,7 +499,7 @@ spec: serviceAccountName: omsagent containers: - 
name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-4" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-7" imagePullPolicy: IfNotPresent resources: limits: diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 603c71e97..92ffcacbe 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -64,7 +64,7 @@ def start @condition = ConditionVariable.new @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) - @@nodeTelemetryTimeTracker = DateTime.now.to_time. + @@nodeTelemetryTimeTracker = DateTime.now.to_time @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end end From c4651c94fa590997206827a2751d83d12fdaf99f Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 4 Dec 2020 06:40:32 -0800 Subject: [PATCH 30/45] fix bug --- kubernetes/omsagent.yaml | 4 ++-- source/plugins/ruby/in_kube_nodes.rb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 290490b19..e56efdabb 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -340,7 +340,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-7" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-8" imagePullPolicy: IfNotPresent resources: limits: @@ -499,7 +499,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-7" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-8" imagePullPolicy: IfNotPresent resources: limits: diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 92ffcacbe..9050fa67c 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ 
b/source/plugins/ruby/in_kube_nodes.rb @@ -64,7 +64,7 @@ def start @condition = ConditionVariable.new @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) - @@nodeTelemetryTimeTracker = DateTime.now.to_time + @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end end From 95144a638d017a3c79efe4cbb161764f29e1357a Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 4 Dec 2020 07:40:56 -0800 Subject: [PATCH 31/45] upping rs memory change --- kubernetes/omsagent.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index e56efdabb..cede6ca87 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -504,7 +504,7 @@ spec: resources: limits: cpu: 1 - memory: 750Mi + memory: 1Gi requests: cpu: 150m memory: 250Mi From ae2cf42140ac94c0860dedc70f7dfdcc18202ad7 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 8 Dec 2020 23:37:08 -0800 Subject: [PATCH 32/45] fix mdm bug with final emit stream --- source/plugins/ruby/in_kube_nodes.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 9050fa67c..f88d80603 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -294,6 +294,8 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if eventStream.count > 0 $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream eventStream = nil end if containerNodeInventoryEventStream.count > 0 From 
cf8da5c665a6024f7c31e52a590a7428c52c641b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 9 Dec 2020 10:17:29 -0800 Subject: [PATCH 33/45] update to latest image --- kubernetes/omsagent.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index cede6ca87..b827a72d5 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -340,7 +340,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-8" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-9" imagePullPolicy: IfNotPresent resources: limits: @@ -499,7 +499,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-8" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-9" imagePullPolicy: IfNotPresent resources: limits: From 11eda7c0e5ed0dc426d39c9ca615667a69e11f4c Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 9 Dec 2020 12:56:52 -0800 Subject: [PATCH 34/45] fix pr feedback --- .../installer/datafiles/base_container.data | 2 +- .../scripts/tomlparser-agent-config.rb | 145 ++++++++++++++++++ kubernetes/linux/main.sh | 12 +- 3 files changed, 152 insertions(+), 7 deletions(-) create mode 100644 build/linux/installer/scripts/tomlparser-agent-config.rb diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index ca2538b79..562a9d6f2 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -122,7 +122,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root /opt/tomlparser-metric-collection-config.rb; 
build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root -/opt/tomlparser-health-config.rb; build/linux/installer/scripts/tomlparser-health-config.rb; 755; root; root +/opt/tomlparser-agent-config.rb; build/linux/installer/scripts/tomlparser-agent-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; build/common/installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root /opt/ConfigParseErrorLogger.rb; build/common/installer/scripts/ConfigParseErrorLogger.rb; 755; root; root diff --git a/build/linux/installer/scripts/tomlparser-agent-config.rb b/build/linux/installer/scripts/tomlparser-agent-config.rb new file mode 100644 index 000000000..68496d718 --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-agent-config.rb @@ -0,0 +1,145 @@ +#!/usr/local/bin/ruby + +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end + +require_relative "ConfigParseErrorLogger" + +@configMapMountPath = "/etc/config/settings/agent-settings" +@configSchemaVersion = "" +@enable_health_model = false +@nodesChunkSize = 0 +@podsChunkSize = 0 +@eventsChunkSize = 0 +@deploymentsChunkSize = 0 +@hpaChunkSize = 0 +@podsEmitStreamBatchSize = 0 +@nodesEmitStreamBatchSize = 0 + +def is_number?(value) + true if Integer(value) rescue false +end + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for agent settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for agent settings not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for agent settings : #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + begin + if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? + if !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? + @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] + puts "enable_health_model = #{@enable_health_model}" + end + chunk_config = parsedConfig[:agent_settings][:chunk_config] + if !chunk_config.nil? + nodesChunkSize = chunk_config[:NODES_CHUNK_SIZE] + if !nodesChunkSize.nil? 
&& is_number?(nodesChunkSize) + @nodesChunkSize = nodesChunkSize.to_i + puts "NODES_CHUNK_SIZE = #{@nodesChunkSize}" + end + podsChunkSize = chunk_config[:PODS_CHUNK_SIZE] + if !podsChunkSize.nil? && is_number?(podsChunkSize) + @podsChunkSize = podsChunkSize.to_i + puts "PODS_CHUNK_SIZE = #{@podsChunkSize}" + end + eventsChunkSize = chunk_config[:EVENTS_CHUNK_SIZE] + if !eventsChunkSize.nil? && is_number?(eventsChunkSize) + @eventsChunkSize = eventsChunkSize.to_i + puts "EVENTS_CHUNK_SIZE = #{@eventsChunkSize}" + end + deploymentsChunkSize = chunk_config[:DEPLOYMENTS_CHUNK_SIZE] + if !deploymentsChunkSize.nil? && is_number?(deploymentsChunkSize) + @deploymentsChunkSize = deploymentsChunkSize.to_i + puts "DEPLOYMENTS_CHUNK_SIZE = #{@deploymentsChunkSize}" + end + hpaChunkSize = chunk_config[:HPA_CHUNK_SIZE] + if !hpaChunkSize.nil? && is_number?(hpaChunkSize) + @hpaChunkSize = hpaChunkSize.to_i + puts "HPA_CHUNK_SIZE = #{@hpaChunkSize}" + end + podsEmitStreamBatchSize = chunk_config[:PODS_EMIT_STREAM_BATCH_SIZE] + if !podsEmitStreamBatchSize.nil? && is_number?(podsEmitStreamBatchSize) + @podsEmitStreamBatchSize = podsEmitStreamBatchSize.to_i + puts "PODS_EMIT_STREAM_BATCH_SIZE = #{@podsEmitStreamBatchSize}" + end + nodesEmitStreamBatchSize = chunk_config[:NODES_EMIT_STREAM_BATCH_SIZE] + if !nodesEmitStreamBatchSize.nil? && is_number?(nodesEmitStreamBatchSize) + @nodesEmitStreamBatchSize = nodesEmitStreamBatchSize.to_i + puts "NODES_EMIT_STREAM_BATCH_SIZE = #{@nodesEmitStreamBatchSize}" + end + end + end + rescue => errorStr + puts "config::error:Exception while reading config settings for health_model enabled setting - #{errorStr}, using defaults" + @enable_health_model = false + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? 
&& @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end + @enable_health_model = false +end + +# Write the settings to file, so that they can be set as environment variables +file = File.open("health_config_env_var", "w") + +if !file.nil? + file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") + if @nodesChunkSize > 0 + file.write("export NODES_CHUNK_SIZE=#{@nodesChunkSize}\n") + end + if @podsChunkSize > 0 + file.write("export PODS_CHUNK_SIZE=#{@podsChunkSize}\n") + end + if @eventsChunkSize > 0 + file.write("export EVENTS_CHUNK_SIZE=#{@eventsChunkSize}\n") + end + if @deploymentsChunkSize > 0 + file.write("export DEPLOYMENTS_CHUNK_SIZE=#{@deploymentsChunkSize}\n") + end + if @hpaChunkSize > 0 + file.write("export HPA_CHUNK_SIZE=#{@hpaChunkSize}\n") + end + if @podsEmitStreamBatchSize > 0 + file.write("export PODS_EMIT_STREAM_BATCH_SIZE=#{@podsEmitStreamBatchSize}\n") + end + if @nodesEmitStreamBatchSize > 0 + file.write("export NODES_EMIT_STREAM_BATCH_SIZE=#{@nodesEmitStreamBatchSize}\n") + end + # Close file after writing all environment variables + file.close +else + puts "Exception while opening file for writing config environment variables" + puts "****************End Config Processing********************" +end diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index a2ba6a1d1..63c9a2ba9 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -171,8 +171,8 @@ done source config_env_var -#Parse the configmap to set the right environment variables for health feature. 
-/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-health-config.rb +#Parse the configmap to set the right environment variables for agent config. +/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb cat health_config_env_var | while read line; do #echo $line @@ -429,7 +429,7 @@ echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc #region check to auto-activate oneagent, to route container logs, #Intent is to activate one agent routing for all managed clusters with region in the regionllist, unless overridden by configmap -# AZMON_CONTAINER_LOGS_ROUTE will have route (if any) specified in the config map +# AZMON_CONTAINER_LOGS_ROUTE will have route (if any) specified in the config map # AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE will have the final route that we compute & set, based on our region list logic echo "************start oneagent log routing checks************" # by default, use configmap route for safer side @@ -462,9 +462,9 @@ else echo "current region is not in oneagent regions..." fi -if [ "$isoneagentregion" = true ]; then +if [ "$isoneagentregion" = true ]; then #if configmap has a routing for logs, but current region is in the oneagent region list, take the configmap route - if [ ! -z $AZMON_CONTAINER_LOGS_ROUTE ]; then + if [ ! -z $AZMON_CONTAINER_LOGS_ROUTE ]; then AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$AZMON_CONTAINER_LOGS_ROUTE echo "oneagent region is true for current region:$currentregion and config map logs route is not empty. so using config map logs route as effective route:$AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" else #there is no configmap route, so route thru oneagent @@ -511,7 +511,7 @@ if [ ! -e "/etc/config/kube.conf" ]; then echo "starting mdsd ..." 
 mdsd -l -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & - + touch /opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2 fi fi From 2f3574d87bf8df8ba55dd96bf426aaefbb37dd17 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 9 Dec 2020 18:03:50 -0800 Subject: [PATCH 35/45] fix pr feedback --- .../scripts/tomlparser-agent-config.rb | 107 ++++++++----- .../scripts/tomlparser-health-config.rb | 145 ------------------ source/plugins/ruby/in_kube_nodes.rb | 1 - source/plugins/ruby/in_kube_podinventory.rb | 3 - 4 files changed, 67 insertions(+), 189 deletions(-) delete mode 100644 build/linux/installer/scripts/tomlparser-health-config.rb diff --git a/build/linux/installer/scripts/tomlparser-agent-config.rb b/build/linux/installer/scripts/tomlparser-agent-config.rb index 68496d718..841f4b081 100644 --- a/build/linux/installer/scripts/tomlparser-agent-config.rb +++ b/build/linux/installer/scripts/tomlparser-agent-config.rb @@ -13,13 +13,47 @@ @configMapMountPath = "/etc/config/settings/agent-settings" @configSchemaVersion = "" @enable_health_model = false -@nodesChunkSize = 0 -@podsChunkSize = 0 -@eventsChunkSize = 0 -@deploymentsChunkSize = 0 -@hpaChunkSize = 0 -@podsEmitStreamBatchSize = 0 -@nodesEmitStreamBatchSize = 0 + +# 250 Node items (15KB per node) account to approximately 4MB +@nodesChunkSize = 250 +# 500 pods (10KB per pod) account to approximately 5MB +@podsChunkSize = 500 +# 4000 events (1KB per event) account to approximately 4MB +@eventsChunkSize = 4000 +# roughly each deployment is 8k +# 500 deployments account to approximately 4MB +@deploymentsChunkSize = 500 +# roughly each HPA is 3k +# 2000 HPAs account to approximately 6-7MB +@hpaChunkSize = 2000 +# stream batch sizes to avoid large file writes +# too low will consume disk i/o +@podsEmitStreamBatchSize = 200 +@nodesEmitStreamBatchSize = 100 + +# higher the chunk size rs pod memory consumption higher and lower api latency +# similarly lower the value, 
helps on the memory consumption but incurs additional round trip latency +# these need to be tuned based on the workload +# nodes +@nodesChunkSizeMin = 100 +@nodesChunkSizeMax = 400 +# pods +@podsChunkSizeMin = 250 +@podsChunkSizeMax = 1500 +# events +@eventsChunkSizeMin = 2000 +@eventsChunkSizeMax = 10000 +# deployments +@deploymentsChunkSizeMin = 500 +@deploymentsChunkSizeMax = 1000 +# hpa +@hpaChunkSizeMin = 500 +@hpaChunkSizeMax = 1000 + +# emit stream sizes to prevent lower values which costs disk i/o +# max will be up to the chunk size +@podsEmitStreamBatchSizeMin = 50 +@nodesEmitStreamBatchSizeMin = 50 def is_number?(value) true if Integer(value) rescue false @@ -55,37 +89,44 @@ def populateSettingValuesFromConfigMap(parsedConfig) chunk_config = parsedConfig[:agent_settings][:chunk_config] if !chunk_config.nil? nodesChunkSize = chunk_config[:NODES_CHUNK_SIZE] - if !nodesChunkSize.nil? && is_number?(nodesChunkSize) + if !nodesChunkSize.nil? && is_number?(nodesChunkSize) && (@nodesChunkSizeMin..@nodesChunkSizeMax) === nodesChunkSize.to_i @nodesChunkSize = nodesChunkSize.to_i - puts "NODES_CHUNK_SIZE = #{@nodesChunkSize}" + puts "Using config map value: NODES_CHUNK_SIZE = #{@nodesChunkSize}" end + podsChunkSize = chunk_config[:PODS_CHUNK_SIZE] - if !podsChunkSize.nil? && is_number?(podsChunkSize) + if !podsChunkSize.nil? && is_number?(podsChunkSize) && (@podsChunkSizeMin..@podsChunkSizeMax) === podsChunkSize.to_i @podsChunkSize = podsChunkSize.to_i - puts "PODS_CHUNK_SIZE = #{@podsChunkSize}" + puts "Using config map value: PODS_CHUNK_SIZE = #{@podsChunkSize}" end + eventsChunkSize = chunk_config[:EVENTS_CHUNK_SIZE] - if !eventsChunkSize.nil? 
&& is_number?(eventsChunkSize) && (@eventsChunkSizeMin..@eventsChunkSizeMax) === eventsChunkSize.to_i @eventsChunkSize = eventsChunkSize.to_i - puts "EVENTS_CHUNK_SIZE = #{@eventsChunkSize}" + puts "Using config map value: EVENTS_CHUNK_SIZE = #{@eventsChunkSize}" end + deploymentsChunkSize = chunk_config[:DEPLOYMENTS_CHUNK_SIZE] - if !deploymentsChunkSize.nil? && is_number?(deploymentsChunkSize) + if !deploymentsChunkSize.nil? && is_number?(deploymentsChunkSize) && (@deploymentsChunkSizeMin..@deploymentsChunkSizeMax) === deploymentsChunkSize.to_i @deploymentsChunkSize = deploymentsChunkSize.to_i - puts "DEPLOYMENTS_CHUNK_SIZE = #{@deploymentsChunkSize}" + puts "Using config map value: DEPLOYMENTS_CHUNK_SIZE = #{@deploymentsChunkSize}" end + hpaChunkSize = chunk_config[:HPA_CHUNK_SIZE] - if !hpaChunkSize.nil? && is_number?(hpaChunkSize) + if !hpaChunkSize.nil? && is_number?(hpaChunkSize) && (@hpaChunkSizeMin..@hpaChunkSizeMax) === hpaChunkSize.to_i @hpaChunkSize = hpaChunkSize.to_i - puts "HPA_CHUNK_SIZE = #{@hpaChunkSize}" + puts "Using config map value: HPA_CHUNK_SIZE = #{@hpaChunkSize}" end + podsEmitStreamBatchSize = chunk_config[:PODS_EMIT_STREAM_BATCH_SIZE] - if !podsEmitStreamBatchSize.nil? && is_number?(podsEmitStreamBatchSize) + if !podsEmitStreamBatchSize.nil? && is_number?(podsEmitStreamBatchSize) && + podsEmitStreamBatchSize.to_i <= @podsChunkSize && podsEmitStreamBatchSize.to_i >= @podsEmitStreamBatchSizeMin @podsEmitStreamBatchSize = podsEmitStreamBatchSize.to_i puts "PODS_EMIT_STREAM_BATCH_SIZE = #{@podsEmitStreamBatchSize}" end nodesEmitStreamBatchSize = chunk_config[:NODES_EMIT_STREAM_BATCH_SIZE] - if !nodesEmitStreamBatchSize.nil? && is_number?(nodesEmitStreamBatchSize) + if !nodesEmitStreamBatchSize.nil? 
&& is_number?(nodesEmitStreamBatchSize) && + nodesEmitStreamBatchSize.to_i <= @nodesChunkSize && nodesEmitStreamBatchSize.to_i >= @nodesEmitStreamBatchSizeMin @nodesEmitStreamBatchSize = nodesEmitStreamBatchSize.to_i puts "NODES_EMIT_STREAM_BATCH_SIZE = #{@nodesEmitStreamBatchSize}" end @@ -116,27 +157,13 @@ def populateSettingValuesFromConfigMap(parsedConfig) if !file.nil? file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") - if @nodesChunkSize > 0 - file.write("export NODES_CHUNK_SIZE=#{@nodesChunkSize}\n") - end - if @podsChunkSize > 0 - file.write("export PODS_CHUNK_SIZE=#{@podsChunkSize}\n") - end - if @eventsChunkSize > 0 - file.write("export EVENTS_CHUNK_SIZE=#{@eventsChunkSize}\n") - end - if @deploymentsChunkSize > 0 - file.write("export DEPLOYMENTS_CHUNK_SIZE=#{@deploymentsChunkSize}\n") - end - if @hpaChunkSize > 0 - file.write("export HPA_CHUNK_SIZE=#{@hpaChunkSize}\n") - end - if @podsEmitStreamBatchSize > 0 - file.write("export PODS_EMIT_STREAM_BATCH_SIZE=#{@podsEmitStreamBatchSize}\n") - end - if @nodesEmitStreamBatchSize > 0 - file.write("export NODES_EMIT_STREAM_BATCH_SIZE=#{@nodesEmitStreamBatchSize}\n") - end + file.write("export NODES_CHUNK_SIZE=#{@nodesChunkSize}\n") + file.write("export PODS_CHUNK_SIZE=#{@podsChunkSize}\n") + file.write("export EVENTS_CHUNK_SIZE=#{@eventsChunkSize}\n") + file.write("export DEPLOYMENTS_CHUNK_SIZE=#{@deploymentsChunkSize}\n") + file.write("export HPA_CHUNK_SIZE=#{@hpaChunkSize}\n") + file.write("export PODS_EMIT_STREAM_BATCH_SIZE=#{@podsEmitStreamBatchSize}\n") + file.write("export NODES_EMIT_STREAM_BATCH_SIZE=#{@nodesEmitStreamBatchSize}\n") # Close file after writing all environment variables file.close else diff --git a/build/linux/installer/scripts/tomlparser-health-config.rb b/build/linux/installer/scripts/tomlparser-health-config.rb deleted file mode 100644 index 68496d718..000000000 --- a/build/linux/installer/scripts/tomlparser-health-config.rb +++ /dev/null @@ -1,145 +0,0 @@ 
-#!/usr/local/bin/ruby - -#this should be require relative in Linux and require in windows, since it is a gem install on windows -@os_type = ENV["OS_TYPE"] -if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 - require "tomlrb" -else - require_relative "tomlrb" -end - -require_relative "ConfigParseErrorLogger" - -@configMapMountPath = "/etc/config/settings/agent-settings" -@configSchemaVersion = "" -@enable_health_model = false -@nodesChunkSize = 0 -@podsChunkSize = 0 -@eventsChunkSize = 0 -@deploymentsChunkSize = 0 -@hpaChunkSize = 0 -@podsEmitStreamBatchSize = 0 -@nodesEmitStreamBatchSize = 0 - -def is_number?(value) - true if Integer(value) rescue false -end - -# Use parser to parse the configmap toml file to a ruby structure -def parseConfigMap - begin - # Check to see if config map is created - if (File.file?(@configMapMountPath)) - puts "config::configmap container-azm-ms-agentconfig for agent settings mounted, parsing values" - parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) - puts "config::Successfully parsed mounted config map" - return parsedConfig - else - puts "config::configmap container-azm-ms-agentconfig for agent settings not mounted, using defaults" - return nil - end - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config map for agent settings : #{errorStr}, using defaults, please check config map for errors") - return nil - end -end - -# Use the ruby structure created after config parsing to set the right values to be used as environment variables -def populateSettingValuesFromConfigMap(parsedConfig) - begin - if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? - if !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? 
- @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] - puts "enable_health_model = #{@enable_health_model}" - end - chunk_config = parsedConfig[:agent_settings][:chunk_config] - if !chunk_config.nil? - nodesChunkSize = chunk_config[:NODES_CHUNK_SIZE] - if !nodesChunkSize.nil? && is_number?(nodesChunkSize) - @nodesChunkSize = nodesChunkSize.to_i - puts "NODES_CHUNK_SIZE = #{@nodesChunkSize}" - end - podsChunkSize = chunk_config[:PODS_CHUNK_SIZE] - if !podsChunkSize.nil? && is_number?(podsChunkSize) - @podsChunkSize = podsChunkSize.to_i - puts "PODS_CHUNK_SIZE = #{@podsChunkSize}" - end - eventsChunkSize = chunk_config[:EVENTS_CHUNK_SIZE] - if !eventsChunkSize.nil? && is_number?(eventsChunkSize) - @eventsChunkSize = eventsChunkSize.to_i - puts "EVENTS_CHUNK_SIZE = #{@eventsChunkSize}" - end - deploymentsChunkSize = chunk_config[:DEPLOYMENTS_CHUNK_SIZE] - if !deploymentsChunkSize.nil? && is_number?(deploymentsChunkSize) - @deploymentsChunkSize = deploymentsChunkSize.to_i - puts "DEPLOYMENTS_CHUNK_SIZE = #{@deploymentsChunkSize}" - end - hpaChunkSize = chunk_config[:HPA_CHUNK_SIZE] - if !hpaChunkSize.nil? && is_number?(hpaChunkSize) - @hpaChunkSize = hpaChunkSize.to_i - puts "HPA_CHUNK_SIZE = #{@hpaChunkSize}" - end - podsEmitStreamBatchSize = chunk_config[:PODS_EMIT_STREAM_BATCH_SIZE] - if !podsEmitStreamBatchSize.nil? && is_number?(podsEmitStreamBatchSize) - @podsEmitStreamBatchSize = podsEmitStreamBatchSize.to_i - puts "PODS_EMIT_STREAM_BATCH_SIZE = #{@podsEmitStreamBatchSize}" - end - nodesEmitStreamBatchSize = chunk_config[:NODES_EMIT_STREAM_BATCH_SIZE] - if !nodesEmitStreamBatchSize.nil? 
&& is_number?(nodesEmitStreamBatchSize) - @nodesEmitStreamBatchSize = nodesEmitStreamBatchSize.to_i - puts "NODES_EMIT_STREAM_BATCH_SIZE = #{@nodesEmitStreamBatchSize}" - end - end - end - rescue => errorStr - puts "config::error:Exception while reading config settings for health_model enabled setting - #{errorStr}, using defaults" - @enable_health_model = false - end -end - -@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] -puts "****************Start Config Processing********************" -if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = parseConfigMap - if !configMapSettings.nil? - populateSettingValuesFromConfigMap(configMapSettings) - end -else - if (File.file?(@configMapMountPath)) - ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") - end - @enable_health_model = false -end - -# Write the settings to file, so that they can be set as environment variables -file = File.open("health_config_env_var", "w") - -if !file.nil? 
- file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") - if @nodesChunkSize > 0 - file.write("export NODES_CHUNK_SIZE=#{@nodesChunkSize}\n") - end - if @podsChunkSize > 0 - file.write("export PODS_CHUNK_SIZE=#{@podsChunkSize}\n") - end - if @eventsChunkSize > 0 - file.write("export EVENTS_CHUNK_SIZE=#{@eventsChunkSize}\n") - end - if @deploymentsChunkSize > 0 - file.write("export DEPLOYMENTS_CHUNK_SIZE=#{@deploymentsChunkSize}\n") - end - if @hpaChunkSize > 0 - file.write("export HPA_CHUNK_SIZE=#{@hpaChunkSize}\n") - end - if @podsEmitStreamBatchSize > 0 - file.write("export PODS_EMIT_STREAM_BATCH_SIZE=#{@podsEmitStreamBatchSize}\n") - end - if @nodesEmitStreamBatchSize > 0 - file.write("export NODES_EMIT_STREAM_BATCH_SIZE=#{@nodesEmitStreamBatchSize}\n") - end - # Close file after writing all environment variables - file.close -else - puts "Exception while opening file for writing config environment variables" - puts "****************End Config Processing********************" -end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index f88d80603..04d71276d 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -34,7 +34,6 @@ def initialize require_relative "omslog" # 250 Node items (15KB per node) account to approximately 4MB @NODES_CHUNK_SIZE = "250" - # 0 indicates no batch enabled for stream emit @NODES_EMIT_STREAM_BATCH_SIZE = 100 @nodeInventoryE2EProcessingLatencyMs = 0 @nodesAPIE2ELatencyMs = 0 diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 7d935b04e..b5f6c4116 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -27,14 +27,12 @@ def initialize require_relative "omslog" require_relative "constants" - # 500 pod (10KB per pod) account to approximately 5MB @PODS_CHUNK_SIZE = "500" @podCount = 0 @serviceCount = 0 @controllerSet = Set.new 
[] @winContainerCount = 0 @controllerData = {} - # 0 indicates no batch enabled for stream emit @PODS_EMIT_STREAM_BATCH_SIZE = 200 @podInventoryE2EProcessingLatencyMs = 0 @podsAPIE2ELatencyMs = 0 @@ -59,7 +57,6 @@ def start @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i end $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") - @finished = false @condition = ConditionVariable.new @mutex = Mutex.new From 6b589a9539be9a08176ee617158dc5cd68df80f3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 12 Dec 2020 18:50:12 -0800 Subject: [PATCH 36/45] rename health config to agent config --- build/linux/installer/scripts/tomlparser-agent-config.rb | 8 ++++---- kubernetes/linux/main.sh | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-agent-config.rb b/build/linux/installer/scripts/tomlparser-agent-config.rb index 841f4b081..3369201fc 100644 --- a/build/linux/installer/scripts/tomlparser-agent-config.rb +++ b/build/linux/installer/scripts/tomlparser-agent-config.rb @@ -122,18 +122,18 @@ def populateSettingValuesFromConfigMap(parsedConfig) if !podsEmitStreamBatchSize.nil? && is_number?(podsEmitStreamBatchSize) && podsEmitStreamBatchSize.to_i <= @podsChunkSize && podsEmitStreamBatchSize.to_i >= @podsEmitStreamBatchSizeMin @podsEmitStreamBatchSize = podsEmitStreamBatchSize.to_i - puts "PODS_EMIT_STREAM_BATCH_SIZE = #{@podsEmitStreamBatchSize}" + puts "Using config map value: PODS_EMIT_STREAM_BATCH_SIZE = #{@podsEmitStreamBatchSize}" end nodesEmitStreamBatchSize = chunk_config[:NODES_EMIT_STREAM_BATCH_SIZE] if !nodesEmitStreamBatchSize.nil? 
&& is_number?(nodesEmitStreamBatchSize) && nodesEmitStreamBatchSize.to_i <= @nodesChunkSize && nodesEmitStreamBatchSize.to_i >= @nodesEmitStreamBatchSizeMin @nodesEmitStreamBatchSize = nodesEmitStreamBatchSize.to_i - puts "NODES_EMIT_STREAM_BATCH_SIZE = #{@nodesEmitStreamBatchSize}" + puts "Using config map value: NODES_EMIT_STREAM_BATCH_SIZE = #{@nodesEmitStreamBatchSize}" end end end rescue => errorStr - puts "config::error:Exception while reading config settings for health_model enabled setting - #{errorStr}, using defaults" + puts "config::error:Exception while reading config settings for agent configuration setting - #{errorStr}, using defaults" @enable_health_model = false end end @@ -153,7 +153,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) end # Write the settings to file, so that they can be set as environment variables -file = File.open("health_config_env_var", "w") +file = File.open("agent_config_env_var", "w") if !file.nil? file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 63c9a2ba9..ed16d3e32 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -174,11 +174,11 @@ source config_env_var #Parse the configmap to set the right environment variables for agent config. /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb -cat health_config_env_var | while read line; do +cat agent_config_env_var | while read line; do #echo $line echo $line >> ~/.bashrc done -source health_config_env_var +source agent_config_env_var #Parse the configmap to set the right environment variables for network policy manager (npm) integration. 
/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb From 53972c2831caf84f64ae729c7f0f6ee34fe32f72 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 13 Dec 2020 08:37:34 -0800 Subject: [PATCH 37/45] fix max allowed hpa chunk size --- build/linux/installer/scripts/tomlparser-agent-config.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/linux/installer/scripts/tomlparser-agent-config.rb b/build/linux/installer/scripts/tomlparser-agent-config.rb index 3369201fc..d37382145 100644 --- a/build/linux/installer/scripts/tomlparser-agent-config.rb +++ b/build/linux/installer/scripts/tomlparser-agent-config.rb @@ -48,7 +48,7 @@ @deploymentsChunkSizeMax = 1000 # hpa @hpaChunkSizeMin = 500 -@hpaChunkSizeMax = 1000 +@hpaChunkSizeMax = 2000 # emit stream sizes to prevent lower values which costs disk i/o # max will be upto the chunk size From f8702ffc50829eafa82de70971109ea2715fc80d Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 14 Dec 2020 08:45:56 -0800 Subject: [PATCH 38/45] update to use 1k pod chunk since validated on 1.18+ --- build/linux/installer/scripts/tomlparser-agent-config.rb | 4 ++-- source/plugins/ruby/in_kube_podinventory.rb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-agent-config.rb b/build/linux/installer/scripts/tomlparser-agent-config.rb index d37382145..1c7e243b9 100644 --- a/build/linux/installer/scripts/tomlparser-agent-config.rb +++ b/build/linux/installer/scripts/tomlparser-agent-config.rb @@ -16,8 +16,8 @@ # 250 Node items (15KB per node) account to approximately 4MB @nodesChunkSize = 250 -# 500 pods (10KB per pod) account to approximately 5MB -@podsChunkSize = 500 +# 1000 pods (10KB per pod) account to approximately 10MB +@podsChunkSize = 1000 # 4000 events (1KB per event) account to approximately 4MB @eventsChunkSize = 4000 # roughly each deployment is 8k diff --git a/source/plugins/ruby/in_kube_podinventory.rb 
b/source/plugins/ruby/in_kube_podinventory.rb index b5f6c4116..5314cd92a 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -27,7 +27,7 @@ def initialize require_relative "omslog" require_relative "constants" - @PODS_CHUNK_SIZE = "500" + @PODS_CHUNK_SIZE = "1000" @podCount = 0 @serviceCount = 0 @controllerSet = Set.new [] From 531f76834c6dcc9d4685945f54a565ab752afd1d Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 14 Dec 2020 15:50:50 -0800 Subject: [PATCH 39/45] remove debug logs --- .../installer/scripts/tomlparser-agent-config.rb | 2 +- source/plugins/ruby/in_kube_events.rb | 8 ++------ source/plugins/ruby/in_kube_nodes.rb | 8 ++------ source/plugins/ruby/in_kube_podinventory.rb | 14 ++++---------- source/plugins/ruby/in_kubestate_deployments.rb | 8 ++------ 5 files changed, 11 insertions(+), 29 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-agent-config.rb b/build/linux/installer/scripts/tomlparser-agent-config.rb index 1c7e243b9..87c5194ed 100644 --- a/build/linux/installer/scripts/tomlparser-agent-config.rb +++ b/build/linux/installer/scripts/tomlparser-agent-config.rb @@ -27,7 +27,7 @@ # 2000 HPAs account to approximately 6-7MB @hpaChunkSize = 2000 # stream batch sizes to avoid large file writes -# to low will consume disk i/o +# too low will consume higher disk iops @podsEmitStreamBatchSize = 200 @nodesEmitStreamBatchSize = 100 diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index 6cea5e996..98d1bddbe 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -87,10 +87,8 @@ def enumerate end $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?) 
- # debug logs to track the payload size eventsCount = eventList["items"].length - eventsInventorySizeInKB = (eventList.to_s.length) / 1024 - $log.info "in_kube_events::enumerate:Received number of events in eventList is #{eventsCount} and size in KB #{eventsInventorySizeInKB} @ #{Time.now.utc.iso8601}" + $log.info "in_kube_events::enumerate:Received number of events in eventList is #{eventsCount} @ #{Time.now.utc.iso8601}" newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime) else $log.warn "in_kube_events::enumerate:Received empty eventList" @@ -100,10 +98,8 @@ def enumerate while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, eventList = KubernetesApiClient.getResourcesAndContinuationToken("events?fieldSelector=type!=Normal&limit=#{@EVENTS_CHUNK_SIZE}&continue=#{continuationToken}") if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?) - # debug logs to track the payload size eventsCount = eventList["items"].length - eventsInventorySizeInKB = (eventList.to_s.length) / 1024 - $log.info "in_kube_events::enumerate:Received number of events in eventList is #{eventsCount} and size in KB #{eventsInventorySizeInKB} @ #{Time.now.utc.iso8601}" + $log.info "in_kube_events::enumerate:Received number of events in eventList is #{eventsCount} @ #{Time.now.utc.iso8601}" newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime) else $log.warn "in_kube_events::enumerate:Received empty eventList" diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 04d71276d..5a8ed9c71 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -97,9 +97,7 @@ def enumerate nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? 
&& !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - # debug logs to track the payload size - nodeInventorySizeInKB = (nodeInventory.to_s.length) / 1024 - $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} and size in KB: #{nodeInventorySizeInKB} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" @@ -112,9 +110,7 @@ def enumerate nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = @nodesAPIE2ELatencyMs + (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) - # debug logs to track the payload size - nodeInventorySizeInKB = (nodeInventory.to_s.length) / 1024 - $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} and size in KB: #{nodeInventorySizeInKB} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 5314cd92a..d6e6739f0 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -96,11 +96,9 @@ def enumerate(podList = nil) $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") if !serviceInfo.nil? 
- # debug logs to track the payload size - serviceInfoResponseSizeInKB = (serviceInfo.body.length) / 1024 - $log.info("in_kube_podinventory::enumerate:Start:Parsing services data using yajl serviceInfo size in KB #{serviceInfoResponseSizeInKB} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) - $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl serviceInfo size in KB #{serviceInfoResponseSizeInKB} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") serviceInfo = nil # service inventory records much smaller and fixed size compared to serviceList serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime) @@ -120,9 +118,7 @@ def enumerate(podList = nil) podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
- # debug logs to track the payload size - podInventorySizeInKB = (podInventory.to_s.length) / 1024 - $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} and size in KB: #{podInventorySizeInKB} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" @@ -135,9 +131,7 @@ def enumerate(podList = nil) podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i @podsAPIE2ELatencyMs = @podsAPIE2ELatencyMs + (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - # debug logs to track the payload size - podInventorySizeInKB = (podInventory.to_s.length) / 1024 - $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} and size in KB: #{podInventorySizeInKB} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index ffbe8ae4f..48ad87041 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -80,9 +80,7 @@ def enumerate continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}", api_group: @DEPLOYMENTS_API_GROUP) 
$log.info("in_kubestate_deployments::enumerate : Done getting deployments from Kube API @ #{Time.now.utc.iso8601}") if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) - # debug logs to track the payload size - deploymentsSizeInKB = (deploymentList.to_s.length) / 1024 - $log.info("in_kubestate_deployments::enumerate : number of deployment items :#{deploymentList["items"].length} and size in KB: #{deploymentsSizeInKB} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kubestate_deployments::enumerate : number of deployment items :#{deploymentList["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(deploymentList, batchTime) else $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" @@ -92,9 +90,7 @@ def enumerate while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @DEPLOYMENTS_API_GROUP) if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) 
- # debug logs to track the payload size - deploymentsSizeInKB = (deploymentList.to_s.length) / 1024 - $log.info("in_kubestate_deployments::enumerate : number of deployment items :#{deploymentList["items"].length} and size in KB: #{deploymentsSizeInKB} from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kubestate_deployments::enumerate : number of deployment items :#{deploymentList["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(deploymentList, batchTime) else $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" From cff2ee4c497323f65a75c2c20e6f3be672ce2141 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 14 Dec 2020 18:15:30 -0800 Subject: [PATCH 40/45] minor updates --- source/plugins/ruby/KubernetesApiClient.rb | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index eb2aa3425..aca2142a0 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -596,7 +596,7 @@ def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metri #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, #if we are coming up with the time it should be same for all nodes #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - if (!node["status"][metricCategory].nil?) + if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?) # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) @@ -794,15 +794,13 @@ def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) if (!serviceList.nil? && !serviceList.empty?) 
servicesCount = serviceList["items"].length @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}") - servicesSizeInKB = (serviceList["items"].to_s.length) / 1024 - @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : size of serviceList in KB #{servicesSizeInKB} @ #{Time.now.utc.iso8601}") serviceList["items"].each do |item| kubeServiceRecord = {} kubeServiceRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated kubeServiceRecord["ServiceName"] = item["metadata"]["name"] kubeServiceRecord["Namespace"] = item["metadata"]["namespace"] kubeServiceRecord["SelectorLabels"] = [item["spec"]["selector"]] - # add these before emit to avoid memory foot print + # added these before emit to avoid memory foot print # kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId # kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName kubeServiceRecord["ClusterIP"] = item["spec"]["clusterIP"] From 60d63911f15902a8881319330dcc9ece940bb774 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Dec 2020 13:28:08 -0800 Subject: [PATCH 41/45] move defaults to common place --- source/plugins/ruby/in_kube_events.rb | 13 +++++++---- source/plugins/ruby/in_kube_nodes.rb | 21 +++++++++++++----- source/plugins/ruby/in_kube_podinventory.rb | 22 ++++++++++++++----- .../plugins/ruby/in_kubestate_deployments.rb | 15 ++++++++----- source/plugins/ruby/in_kubestate_hpa.rb | 15 ++++++++----- 5 files changed, 61 insertions(+), 25 deletions(-) diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index 98d1bddbe..ff2b73dd6 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -17,8 +17,9 @@ def initialize require_relative "omslog" require_relative "ApplicationInsightsUtility" - # 4000 events (1KB per event) account to approximately 4MB - 
@EVENTS_CHUNK_SIZE = 4000 + # refer tomlparser-agent-config for defaults + # this configurable via configmap + @EVENTS_CHUNK_SIZE = 0 # Initializing events count for telemetry @eventsCount = 0 @@ -36,8 +37,12 @@ def configure(conf) def start if @run_interval - if !ENV["EVENTS_CHUNK_SIZE"].nil? && !ENV["EVENTS_CHUNK_SIZE"].empty? - @EVENTS_CHUNK_SIZE = ENV["EVENTS_CHUNK_SIZE"] + if !ENV["EVENTS_CHUNK_SIZE"].nil? && !ENV["EVENTS_CHUNK_SIZE"].empty? && ENV["EVENTS_CHUNK_SIZE"].to_i > 0 + @EVENTS_CHUNK_SIZE = ENV["EVENTS_CHUNK_SIZE"].to_i + else + # this shouldnt happen and setting default just safe gauard + $log.warn("in_kube_events::start: setting to default value since got EVENTS_CHUNK_SIZE nil or empty") + @EVENTS_CHUNK_SIZE = 4000 end $log.info("in_kube_events::start : EVENTS_CHUNK_SIZE @ #{@EVENTS_CHUNK_SIZE}") diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 5a8ed9c71..ce060d9bb 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -32,9 +32,10 @@ def initialize require_relative "ApplicationInsightsUtility" require_relative "oms_common" require_relative "omslog" - # 250 Node items (15KB per node) account to approximately 4MB - @NODES_CHUNK_SIZE = "250" - @NODES_EMIT_STREAM_BATCH_SIZE = 100 + # refer tomlparser-agent-config for the defaults + @NODES_CHUNK_SIZE = 0 + @NODES_EMIT_STREAM_BATCH_SIZE = 0 + @nodeInventoryE2EProcessingLatencyMs = 0 @nodesAPIE2ELatencyMs = 0 require_relative "constants" @@ -49,13 +50,21 @@ def configure(conf) def start if @run_interval - if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? - @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"] + if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? 
&& ENV["NODES_CHUNK_SIZE"].to_i > 0 + @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i + else + # this shouldnt happen adding safe gauard + $log.warn("in_kube_nodes::start: setting to default value since got NODES_CHUNK_SIZE nil or empty") + @NODES_CHUNK_SIZE = 250 end $log.info("in_kube_nodes::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}") - if !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].empty? + if !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].empty? && ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i > 0 @NODES_EMIT_STREAM_BATCH_SIZE = ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i + else + # this shouldnt happen and setting default just safe gauard in case + $log.warn("in_kube_nodes::start: setting to default value since got NODES_EMIT_STREAM_BATCH_SIZE nil or empty") + @NODES_EMIT_STREAM_BATCH_SIZE = 100 end $log.info("in_kube_nodes::start : NODES_EMIT_STREAM_BATCH_SIZE @ #{@NODES_EMIT_STREAM_BATCH_SIZE}") diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index d6e6739f0..ccd763bea 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -27,13 +27,16 @@ def initialize require_relative "omslog" require_relative "constants" - @PODS_CHUNK_SIZE = "1000" + # refer tomlparser-agent-config for updating defaults + # this configurable via configmap + @PODS_CHUNK_SIZE = 0 + @PODS_EMIT_STREAM_BATCH_SIZE = 0 + @podCount = 0 @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 @controllerData = {} - @PODS_EMIT_STREAM_BATCH_SIZE = 200 @podInventoryE2EProcessingLatencyMs = 0 @podsAPIE2ELatencyMs = 0 end @@ -48,15 +51,24 @@ def configure(conf) def start if @run_interval - if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? - @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"] + if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? 
&& ENV["PODS_CHUNK_SIZE"].to_i > 0 + @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"].to_i + else + # this shouldnt happen and setting default as safe gauard in case + $log.warn("in_kube_podinventory::start: setting to default value since got PODS_CHUNK_SIZE nil or empty") + @PODS_CHUNK_SIZE = 1000 end $log.info("in_kube_podinventory::start : PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}") - if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? + if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? && ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i > 0 @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i + else + # this shouldnt happen and setting default as safe gauard in case + $log.warn("in_kube_podinventory::start: setting to default value since got PODS_EMIT_STREAM_BATCH_SIZE nil or empty") + @PODS_EMIT_STREAM_BATCH_SIZE = 200 end $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index 48ad87041..15e554f06 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -21,9 +21,10 @@ def initialize require_relative "ApplicationInsightsUtility" require_relative "constants" - # roughly each deployment is 8k - # 500 deployments account to approximately 4MB - @DEPLOYMENTS_CHUNK_SIZE = 500 + # refer tomlparser-agent-config for defaults + # this configurable via configmap + @DEPLOYMENTS_CHUNK_SIZE = 0 + @DEPLOYMENTS_API_GROUP = "apps" @@telemetryLastSentTime = DateTime.now.to_time.to_i @@ -43,8 +44,12 @@ def configure(conf) def start if @run_interval - if !ENV["DEPLOYMENTS_CHUNK_SIZE"].nil? && !ENV["DEPLOYMENTS_CHUNK_SIZE"].empty? 
- @DEPLOYMENTS_CHUNK_SIZE = ENV["DEPLOYMENTS_CHUNK_SIZE"] + if !ENV["DEPLOYMENTS_CHUNK_SIZE"].nil? && !ENV["DEPLOYMENTS_CHUNK_SIZE"].empty? && ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i > 0 + @DEPLOYMENTS_CHUNK_SIZE = ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i + else + # this shouldnt happen and setting default as safe gauard in case + $log.warn("in_kubestate_deployments::start: setting to default value since got DEPLOYMENTS_CHUNK_SIZE nil or empty") + @DEPLOYMENTS_CHUNK_SIZE = 500 end $log.info("in_kubestate_deployments::start : DEPLOYMENTS_CHUNK_SIZE @ #{@DEPLOYMENTS_CHUNK_SIZE}") diff --git a/source/plugins/ruby/in_kubestate_hpa.rb b/source/plugins/ruby/in_kubestate_hpa.rb index 736f17250..ea6f851bf 100644 --- a/source/plugins/ruby/in_kubestate_hpa.rb +++ b/source/plugins/ruby/in_kubestate_hpa.rb @@ -18,9 +18,10 @@ def initialize require_relative "ApplicationInsightsUtility" require_relative "constants" - # roughly each HPA is 3k - # 2000 HPAs account to approximately 6-7MB - @HPA_CHUNK_SIZE = 2000 + # refer tomlparser-agent-config for defaults + # this configurable via configmap + @HPA_CHUNK_SIZE = 0 + @HPA_API_GROUP = "autoscaling" # telemetry @@ -40,8 +41,12 @@ def configure(conf) def start if @run_interval - if !ENV["HPA_CHUNK_SIZE"].nil? && !ENV["HPA_CHUNK_SIZE"].empty? - @HPA_CHUNK_SIZE = ENV["HPA_CHUNK_SIZE"] + if !ENV["HPA_CHUNK_SIZE"].nil? && !ENV["HPA_CHUNK_SIZE"].empty? 
&& ENV["HPA_CHUNK_SIZE"].to_i > 0 + @HPA_CHUNK_SIZE = ENV["HPA_CHUNK_SIZE"].to_i + else + # this shouldnt happen and setting default as safe gauard in case + $log.warn("in_kubestate_hpa::start: setting to default value since got HPA_CHUNK_SIZE nil or empty") + @HPA_CHUNK_SIZE = 2000 end $log.info("in_kubestate_hpa::start : HPA_CHUNK_SIZE @ #{@HPA_CHUNK_SIZE}") From f88ae920f81fe4d8d6d801fcd981c681b495c86a Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Dec 2020 15:10:25 -0800 Subject: [PATCH 42/45] chart updates --- .../templates/omsagent-rs-configmap.yaml | 32 +++++++++---------- charts/azuremonitor-containers/values.yaml | 9 ++++++ 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml index baeedf1be..fc7c471f8 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml @@ -95,7 +95,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer @@ -108,24 +108,24 @@ data: - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer @@ -155,7 
+155,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer @@ -184,7 +184,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index e8acda20e..907e315d1 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -81,6 +81,15 @@ omsagent: deployment: affinity: nodeAffinity: + # affinity to schedule on to ephemeral os node if its available + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: storageprofile + operator: NotIn + values: + - managed requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - labelSelector: From 0392e28b774e3e7c115fa1226702fdfb3df61738 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Dec 2020 15:11:58 -0800 Subject: [PATCH 43/45] final oomfix agent --- kubernetes/omsagent.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 43591c365..f9515bb66 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -364,7 +364,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-9" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomfix12152020" imagePullPolicy: IfNotPresent resources: limits: @@ -523,7 +523,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomtest11282020-9" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomfix12152020" imagePullPolicy: IfNotPresent resources: limits: From 
6be2e13abaf0e8443c9826fd6a7c7bc671dc9cb8 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Dec 2020 15:35:20 -0800 Subject: [PATCH 44/45] update to use prod image so that can be validated with build pipeline --- kubernetes/omsagent.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index f9515bb66..013e2a6c0 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -364,7 +364,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomfix12152020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020" imagePullPolicy: IfNotPresent resources: limits: @@ -523,7 +523,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:cioomfix12152020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020" imagePullPolicy: IfNotPresent resources: limits: From 1c25829041f75d7640271965eaa711ba889cffb1 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Dec 2020 15:48:26 -0800 Subject: [PATCH 45/45] fix typo in comment --- source/plugins/ruby/in_kube_events.rb | 2 +- source/plugins/ruby/in_kube_nodes.rb | 4 ++-- source/plugins/ruby/in_kube_podinventory.rb | 4 ++-- source/plugins/ruby/in_kubestate_deployments.rb | 2 +- source/plugins/ruby/in_kubestate_hpa.rb | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index ff2b73dd6..4f6017cc5 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -40,7 +40,7 @@ def start if !ENV["EVENTS_CHUNK_SIZE"].nil? && !ENV["EVENTS_CHUNK_SIZE"].empty? 
&& ENV["EVENTS_CHUNK_SIZE"].to_i > 0 @EVENTS_CHUNK_SIZE = ENV["EVENTS_CHUNK_SIZE"].to_i else - # this shouldnt happen and setting default just safe gauard + # this shouldnt happen just setting default here as safe guard $log.warn("in_kube_events::start: setting to default value since got EVENTS_CHUNK_SIZE nil or empty") @EVENTS_CHUNK_SIZE = 4000 end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index ce060d9bb..e7c5060a5 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -53,7 +53,7 @@ def start if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? && ENV["NODES_CHUNK_SIZE"].to_i > 0 @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i else - # this shouldnt happen adding safe gauard + # this shouldnt happen just setting default here as safe guard $log.warn("in_kube_nodes::start: setting to default value since got NODES_CHUNK_SIZE nil or empty") @NODES_CHUNK_SIZE = 250 end @@ -62,7 +62,7 @@ def start if !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].empty? && ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i > 0 @NODES_EMIT_STREAM_BATCH_SIZE = ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i else - # this shouldnt happen and setting default just safe gauard in case + # this shouldnt happen just setting default here as safe guard $log.warn("in_kube_nodes::start: setting to default value since got NODES_EMIT_STREAM_BATCH_SIZE nil or empty") @NODES_EMIT_STREAM_BATCH_SIZE = 100 end diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index ccd763bea..0cff2eefe 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -54,7 +54,7 @@ def start if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? 
&& ENV["PODS_CHUNK_SIZE"].to_i > 0 @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"].to_i else - # this shouldnt happen and setting default as safe gauard in case + # this shouldnt happen just setting default here as safe guard $log.warn("in_kube_podinventory::start: setting to default value since got PODS_CHUNK_SIZE nil or empty") @PODS_CHUNK_SIZE = 1000 end @@ -63,7 +63,7 @@ def start if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? && ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i > 0 @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i else - # this shouldnt happen and setting default as safe gauard in case + # this shouldnt happen just setting default here as safe guard $log.warn("in_kube_podinventory::start: setting to default value since got PODS_EMIT_STREAM_BATCH_SIZE nil or empty") @PODS_EMIT_STREAM_BATCH_SIZE = 200 end diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index 15e554f06..27e4709a2 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -47,7 +47,7 @@ def start if !ENV["DEPLOYMENTS_CHUNK_SIZE"].nil? && !ENV["DEPLOYMENTS_CHUNK_SIZE"].empty? && ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i > 0 @DEPLOYMENTS_CHUNK_SIZE = ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i else - # this shouldnt happen and setting default as safe gauard in case + # this shouldnt happen just setting default here as safe guard $log.warn("in_kubestate_deployments::start: setting to default value since got DEPLOYMENTS_CHUNK_SIZE nil or empty") @DEPLOYMENTS_CHUNK_SIZE = 500 end diff --git a/source/plugins/ruby/in_kubestate_hpa.rb b/source/plugins/ruby/in_kubestate_hpa.rb index ea6f851bf..afecf8e3b 100644 --- a/source/plugins/ruby/in_kubestate_hpa.rb +++ b/source/plugins/ruby/in_kubestate_hpa.rb @@ -44,7 +44,7 @@ def start if !ENV["HPA_CHUNK_SIZE"].nil? && !ENV["HPA_CHUNK_SIZE"].empty? 
&& ENV["HPA_CHUNK_SIZE"].to_i > 0 @HPA_CHUNK_SIZE = ENV["HPA_CHUNK_SIZE"].to_i else - # this shouldnt happen and setting default as safe gauard in case + # this shouldnt happen just setting default here as safe guard $log.warn("in_kubestate_hpa::start: setting to default value since got HPA_CHUNK_SIZE nil or empty") @HPA_CHUNK_SIZE = 2000 end