From 99c22b114a784dd87cafad9d6b2c3bb3c35dd6ab Mon Sep 17 00:00:00 2001 From: rashmy Date: Thu, 7 Mar 2019 22:39:33 -0800 Subject: [PATCH 01/38] changes for win nodes enumeration --- installer/conf/kube.conf | 23 + source/code/plugin/KubernetesApiClient.rb | 926 +++++++++++---------- source/code/plugin/in_win_cadvisor_perf.rb | 89 ++ 3 files changed, 590 insertions(+), 448 deletions(-) create mode 100644 source/code/plugin/in_win_cadvisor_perf.rb diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 454df6e91..8a82af04b 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -47,6 +47,14 @@ log_level debug +#cadvisor perf- Windows nodes + + type wincadvisorperf + tag oms.containerinsights.wincadvisorperf + run_interval 60s + log_level debug + + type filter_inventory2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope @@ -168,3 +176,18 @@ max_retry_wait 9m retry_mdm_post_wait_minutes 60 + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_wincadvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + \ No newline at end of file diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index a1e143b15..648084002 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -2,474 +2,504 @@ # frozen_string_literal: true class KubernetesApiClient + require "json" + require "logger" + require "net/http" + require "net/https" + require "uri" + require "time" - require 'json' - require 'logger' - require 'net/http' - require 'net/https' - require 'uri' - require 'time' - - require_relative 'oms_common' - - @@ApiVersion = "v1" - @@CaFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - @@ClusterName = nil - @@ClusterId = nil - @@IsNodeMaster = nil - #@@IsValidRunningNode = nil - #@@IsLinuxCluster = nil - @@KubeSystemNamespace = "kube-system" - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt" - @Log = Logger.new(@LogPath, 2, 10*1048576) #keep last 2 files, max log file size = 10M - @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" - @@TokenStr = nil - @@NodeMetrics = Hash.new - - def initialize + require_relative "oms_common" + + @@ApiVersion = "v1" + @@CaFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + @@ClusterName = nil + @@ClusterId = nil + @@IsNodeMaster = nil + #@@IsValidRunningNode = nil + #@@IsLinuxCluster = nil + @@KubeSystemNamespace = "kube-system" + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt" + @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M + @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" + @@TokenStr = nil + @@NodeMetrics = Hash.new + + def initialize + end + + class << self + def getKubeResourceInfo(resource) + headers = {} + response = nil + @Log.info "Getting Kube resource" + @Log.info resource + begin + resourceUri = getResourceUri(resource) + if !resourceUri.nil? + uri = URI.parse(resourceUri) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = true + if !File.exist?(@@CaFile) + raise "#{@@CaFile} doesnt exist" + else + http.ca_file = @@CaFile if File.exist?(@@CaFile) + end + http.verify_mode = OpenSSL::SSL::VERIFY_PEER + + kubeApiRequest = Net::HTTP::Get.new(uri.request_uri) + kubeApiRequest["Authorization"] = "Bearer " + getTokenStr + @Log.info "KubernetesAPIClient::getKubeResourceInfo : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" + response = http.request(kubeApiRequest) + @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" end + rescue => error + @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") + end + if (response.body.empty?) + @Log.warn("KubernetesAPIClient::getKubeResourceInfo : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") + end + return response + end - class << self - def getKubeResourceInfo(resource) - headers = {} - response = nil - @Log.info 'Getting Kube resource' - @Log.info resource - begin - resourceUri = getResourceUri(resource) - if !resourceUri.nil? - uri = URI.parse(resourceUri) - http = Net::HTTP.new(uri.host, uri.port) - http.use_ssl = true - if !File.exist?(@@CaFile) - raise "#{@@CaFile} doesnt exist" - else - http.ca_file = @@CaFile if File.exist?(@@CaFile) - end - http.verify_mode = OpenSSL::SSL::VERIFY_PEER - - kubeApiRequest = Net::HTTP::Get.new(uri.request_uri) - kubeApiRequest['Authorization'] = "Bearer " + getTokenStr - @Log.info "KubernetesAPIClient::getKubeResourceInfo : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" - response = http.request(kubeApiRequest) - @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" - end - rescue => error - @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") - end - if (response.body.empty?) - @Log.warn("KubernetesAPIClient::getKubeResourceInfo : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") - end - return response - end + def getTokenStr + return @@TokenStr if !@@TokenStr.nil? + begin + if File.exist?(@@TokenFileName) && File.readable?(@@TokenFileName) + @@TokenStr = File.read(@@TokenFileName).strip + return @@TokenStr + else + @Log.warn("Unable to read token string from #{@@TokenFileName}: #{error}") + return nil + end + end + end - def getTokenStr - return @@TokenStr if !@@TokenStr.nil? - begin - if File.exist?(@@TokenFileName) && File.readable?(@@TokenFileName) - @@TokenStr = File.read(@@TokenFileName).strip - return @@TokenStr - else - @Log.warn("Unable to read token string from #{@@TokenFileName}: #{error}") - return nil - end - end - end + def getResourceUri(resource) + begin + if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"] + return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + resource + else + @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri") + return nil + end + end + end - def getResourceUri(resource) - begin - if ENV['KUBERNETES_SERVICE_HOST'] && ENV['KUBERNETES_PORT_443_TCP_PORT'] - return "https://#{ENV['KUBERNETES_SERVICE_HOST']}:#{ENV['KUBERNETES_PORT_443_TCP_PORT']}/api/" + @@ApiVersion + "/" + resource - else - @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV['KUBERNETES_SERVICE_HOST']} KUBERNETES_PORT_443_TCP_PORT: #{ENV['KUBERNETES_PORT_443_TCP_PORT']}. Unable to form resourceUri") - return nil - end + def getClusterName + return @@ClusterName if !@@ClusterName.nil? + @@ClusterName = "None" + begin + #try getting resource ID for aks + cluster = ENV["AKS_RESOURCE_ID"] + if cluster && !cluster.nil? && !cluster.empty? + @@ClusterName = cluster.split("/").last + else + cluster = ENV["ACS_RESOURCE_NAME"] + if cluster && !cluster.nil? && !cluster.empty? + @@ClusterName = cluster + else + kubesystemResourceUri = "namespaces/" + @@KubeSystemNamespace + "/pods" + @Log.info("KubernetesApiClient::getClusterName : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) + @Log.info("KubernetesApiClient::getClusterName : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo["items"].each do |items| + if items["metadata"]["name"].include? "kube-controller-manager" + items["spec"]["containers"][0]["command"].each do |command| + if command.include? "--cluster-name" + @@ClusterName = command.split("=")[1] + end end + end end + end + end + rescue => error + @Log.warn("getClusterName failed: #{error}") + end + return @@ClusterName + end - def getClusterName - return @@ClusterName if !@@ClusterName.nil? - @@ClusterName = "None" - begin - #try getting resource ID for aks - cluster = ENV['AKS_RESOURCE_ID'] - if cluster && !cluster.nil? && !cluster.empty? - @@ClusterName = cluster.split("/").last - else - cluster = ENV['ACS_RESOURCE_NAME'] - if cluster && !cluster.nil? && !cluster.empty? - @@ClusterName = cluster - else - kubesystemResourceUri = "namespaces/" + @@KubeSystemNamespace + "/pods" - @Log.info("KubernetesApiClient::getClusterName : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) - @Log.info("KubernetesApiClient::getClusterName : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo['items'].each do |items| - if items['metadata']['name'].include? "kube-controller-manager" - items['spec']['containers'][0]['command'].each do |command| - if command.include? "--cluster-name" - @@ClusterName = command.split('=')[1] - end - end - end - end - end - end - rescue => error - @Log.warn("getClusterName failed: #{error}") - end - return @@ClusterName - end + def getClusterId + return @@ClusterId if !@@ClusterId.nil? + #By default initialize ClusterId to ClusterName. + # In ACS/On-prem, we need to figure out how we can generate ClusterId + @@ClusterId = getClusterName + begin + cluster = ENV["AKS_RESOURCE_ID"] + if cluster && !cluster.nil? && !cluster.empty? + @@ClusterId = cluster + end + rescue => error + @Log.warn("getClusterId failed: #{error}") + end + return @@ClusterId + end - def getClusterId - return @@ClusterId if !@@ClusterId.nil? - #By default initialize ClusterId to ClusterName. - # In ACS/On-prem, we need to figure out how we can generate ClusterId - @@ClusterId = getClusterName - begin - cluster = ENV['AKS_RESOURCE_ID'] - if cluster && !cluster.nil? && !cluster.empty? - @@ClusterId = cluster - end - rescue => error - @Log.warn("getClusterId failed: #{error}") - end - return @@ClusterId + def isNodeMaster + return @@IsNodeMaster if !@@IsNodeMaster.nil? + @@IsNodeMaster = false + begin + @Log.info("KubernetesApiClient::isNodeMaster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + allNodesInfo = JSON.parse(getKubeResourceInfo("nodes").body) + @Log.info("KubernetesApiClient::isNodeMaster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + if !allNodesInfo.nil? && !allNodesInfo.empty? + thisNodeName = OMS::Common.get_hostname + allNodesInfo["items"].each do |item| + if item["metadata"]["name"].casecmp(thisNodeName) == 0 + if item["metadata"]["labels"]["kubernetes.io/role"].to_s.include?("master") || item["metadata"]["labels"]["role"].to_s.include?("master") + @@IsNodeMaster = true + end + break end + end + end + rescue => error + @Log.warn("KubernetesApiClient::isNodeMaster : node role request failed: #{error}") + end - def isNodeMaster - return @@IsNodeMaster if !@@IsNodeMaster.nil? - @@IsNodeMaster = false - begin - @Log.info("KubernetesApiClient::isNodeMaster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - allNodesInfo = JSON.parse(getKubeResourceInfo('nodes').body) - @Log.info("KubernetesApiClient::isNodeMaster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - if !allNodesInfo.nil? && !allNodesInfo.empty? - thisNodeName = OMS::Common.get_hostname - allNodesInfo['items'].each do |item| - if item['metadata']['name'].casecmp(thisNodeName) == 0 - if item['metadata']['labels']["kubernetes.io/role"].to_s.include?("master") || item['metadata']['labels']["role"].to_s.include?("master") - @@IsNodeMaster = true - end - break - end - end - end - rescue => error - @Log.warn("KubernetesApiClient::isNodeMaster : node role request failed: #{error}") - end - - return @@IsNodeMaster - end + return @@IsNodeMaster + end - #def isValidRunningNode - # return @@IsValidRunningNode if !@@IsValidRunningNode.nil? - # @@IsValidRunningNode = false - # begin - # thisNodeName = OMS::Common.get_hostname - # if isLinuxCluster - # # Run on agent node [0] - # @@IsValidRunningNode = !isNodeMaster && thisNodeName.to_s.split('-').last == '0' - # else - # # Run on master node [0] - # @@IsValidRunningNode = isNodeMaster && thisNodeName.to_s.split('-').last == '0' - # end - # rescue => error - # @Log.warn("Checking Node Type failed: #{error}") - # end - # if(@@IsValidRunningNode == true) - # @Log.info("Electing current node to talk to k8 api") - # else - # @Log.info("Not Electing current node to talk to k8 api") - # end - # return @@IsValidRunningNode - #end - - #def isLinuxCluster - # return @@IsLinuxCluster if !@@IsLinuxCluster.nil? - # @@IsLinuxCluster = true - # begin - # @Log.info("KubernetesApiClient::isLinuxCluster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - # allNodesInfo = JSON.parse(getKubeResourceInfo('nodes').body) - # @Log.info("KubernetesApiClient::isLinuxCluster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - # if !allNodesInfo.nil? && !allNodesInfo.empty? - # allNodesInfo['items'].each do |item| - # if !(item['status']['nodeInfo']['operatingSystem'].casecmp('linux') == 0) - # @@IsLinuxCluster = false - # break - # end - # end - # end - # rescue => error - # @Log.warn("KubernetesApiClient::isLinuxCluster : node role request failed: #{error}") - # end - # return @@IsLinuxCluster - #end - - # returns an arry of pods (json) - def getPods(namespace) - pods = [] - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods" - podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) - podInfo['items'].each do |items| - pods.push items - end - rescue => error - @Log.warn("List pods request failed: #{error}") - end - return pods - end + #def isValidRunningNode + # return @@IsValidRunningNode if !@@IsValidRunningNode.nil? + # @@IsValidRunningNode = false + # begin + # thisNodeName = OMS::Common.get_hostname + # if isLinuxCluster + # # Run on agent node [0] + # @@IsValidRunningNode = !isNodeMaster && thisNodeName.to_s.split('-').last == '0' + # else + # # Run on master node [0] + # @@IsValidRunningNode = isNodeMaster && thisNodeName.to_s.split('-').last == '0' + # end + # rescue => error + # @Log.warn("Checking Node Type failed: #{error}") + # end + # if(@@IsValidRunningNode == true) + # @Log.info("Electing current node to talk to k8 api") + # else + # @Log.info("Not Electing current node to talk to k8 api") + # end + # return @@IsValidRunningNode + #end + + #def isLinuxCluster + # return @@IsLinuxCluster if !@@IsLinuxCluster.nil? + # @@IsLinuxCluster = true + # begin + # @Log.info("KubernetesApiClient::isLinuxCluster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + # allNodesInfo = JSON.parse(getKubeResourceInfo('nodes').body) + # @Log.info("KubernetesApiClient::isLinuxCluster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + # if !allNodesInfo.nil? && !allNodesInfo.empty? + # allNodesInfo['items'].each do |item| + # if !(item['status']['nodeInfo']['operatingSystem'].casecmp('linux') == 0) + # @@IsLinuxCluster = false + # break + # end + # end + # end + # rescue => error + # @Log.warn("KubernetesApiClient::isLinuxCluster : node role request failed: #{error}") + # end + # return @@IsLinuxCluster + #end + + # returns an arry of pods (json) + def getPods(namespace) + pods = [] + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods" + podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) + podInfo["items"].each do |items| + pods.push items + end + rescue => error + @Log.warn("List pods request failed: #{error}") + end + return pods + end - def getContainerIDs(namespace) - containers = Hash.new - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods" - @Log.info("KubernetesApiClient::getContainerIDs : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) - @Log.info("KubernetesApiClient::getContainerIDs : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo['items'].each do |item| - if (!item['status'].nil? && !item['status'].empty? && !item['status']['containerStatuses'].nil? && !item['status']['containerStatuses'].empty?) - item['status']['containerStatuses'].each do |cntr| - containers[cntr['containerID']] = "kube-system" - end - end - end - rescue => error - @Log.warn("KubernetesApiClient::getContainerIDs : List ContainerIDs request failed: #{error}") + # returns a hash of windows node names and their internal IPs + def getWindowsNodes + winNodes = [] + begin + # kubesystemResourceUri = "namespaces/" + namespace + "/pods" + # podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) + nodeInventory = JSON.parse(getKubeResourceInfo("nodes").body) + if (!nodeInventory.empty?) + nodeInventory["items"].each do |item| + # check for windows operating system in node metadata + winNode = {} + nodeStatus = item["status"] + if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? + operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a? String && operatingSystem.casecmp("windows") == 0) + nodeStatusAddresses = nodeStatus["addresses"] + if !nodeStatusAddresses.nil? + nodeStatusAddresses.each do |address| + winNode[address["type"]] = address[address] + end + winNodes.push(winNode) end - return containers + end end + end + end + return winNodes + rescue => error + @Log.warn("Error in get windows nodes: #{error}") + end + return pods + end - def getContainerLogs(namespace, pod, container, showTimeStamp) - containerLogs = "" - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container - if showTimeStamp - kubesystemResourceUri += "×tamps=true" - end - @Log.info("KubernetesApiClient::getContainerLogs : Getting logs from Kube API @ #{Time.now.utc.iso8601}") - containerLogs = getKubeResourceInfo(kubesystemResourceUri).body - @Log.info("KubernetesApiClient::getContainerLogs : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") - rescue => error - @Log.warn("Pod logs request failed: #{error}") - end - return containerLogs + def getContainerIDs(namespace) + containers = Hash.new + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods" + @Log.info("KubernetesApiClient::getContainerIDs : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) + @Log.info("KubernetesApiClient::getContainerIDs : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo["items"].each do |item| + if (!item["status"].nil? && !item["status"].empty? && !item["status"]["containerStatuses"].nil? && !item["status"]["containerStatuses"].empty?) + item["status"]["containerStatuses"].each do |cntr| + containers[cntr["containerID"]] = "kube-system" end + end + end + rescue => error + @Log.warn("KubernetesApiClient::getContainerIDs : List ContainerIDs request failed: #{error}") + end + return containers + end + + def getContainerLogs(namespace, pod, container, showTimeStamp) + containerLogs = "" + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container + if showTimeStamp + kubesystemResourceUri += "×tamps=true" + end + @Log.info("KubernetesApiClient::getContainerLogs : Getting logs from Kube API @ #{Time.now.utc.iso8601}") + containerLogs = getKubeResourceInfo(kubesystemResourceUri).body + @Log.info("KubernetesApiClient::getContainerLogs : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") + rescue => error + @Log.warn("Pod logs request failed: #{error}") + end + return containerLogs + end + + def getContainerLogsSinceTime(namespace, pod, container, since, showTimeStamp) + containerLogs = "" + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container + "&sinceTime=" + since + kubesystemResourceUri = URI.escape(kubesystemResourceUri, ":.+") # HTML URL Encoding for date + + if showTimeStamp + kubesystemResourceUri += "×tamps=true" + end + @Log.info("calling #{kubesystemResourceUri}") + @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Getting logs from Kube API @ #{Time.now.utc.iso8601}") + containerLogs = getKubeResourceInfo(kubesystemResourceUri).body + @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") + rescue => error + @Log.warn("Pod logs request failed: #{error}") + end + return containerLogs + end + + def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) + metricItems = [] + begin + clusterId = getClusterId + metricInfo = metricJSON + metricInfo["items"].each do |pod| + podNameSpace = pod["metadata"]["namespace"] + if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences") + # The above case seems to be the only case where you have horizontal scaling of pods + # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash + # instead of the actual poduid. Since this uid is not being surface into the UX + # its ok to use this. + # Use kubernetes.io/config.hash to be able to correlate with cadvisor data + podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] + else + podUid = pod["metadata"]["uid"] + end + if (!pod["spec"]["containers"].nil? && !pod["spec"]["nodeName"].nil?) + nodeName = pod["spec"]["nodeName"] + pod["spec"]["containers"].each do |container| + containerName = container["name"] + metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) + metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) - def getContainerLogsSinceTime(namespace, pod, container, since, showTimeStamp) - containerLogs = "" - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container + "&sinceTime=" + since - kubesystemResourceUri = URI.escape(kubesystemResourceUri, ":.+") # HTML URL Encoding for date - - if showTimeStamp - kubesystemResourceUri += "×tamps=true" - end - @Log.info("calling #{kubesystemResourceUri}") - @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Getting logs from Kube API @ #{Time.now.utc.iso8601}") - containerLogs = getKubeResourceInfo(kubesystemResourceUri).body - @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") - rescue => error - @Log.warn("Pod logs request failed: #{error}") + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = nodeName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #No container level limit for the given metric, so default to node level limit + else + nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect + if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) + metricValue = @@NodeMetrics[nodeMetricsHashKey] + #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = nodeName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) end - return containerLogs + end end + end + end + rescue => error + @Log.warn("getcontainerResourceRequestsAndLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + return metricItems + end + return metricItems + end #getContainerResourceRequestAndLimits - def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) - metricItems = [] - begin - clusterId = getClusterId - metricInfo = metricJSON - metricInfo['items'].each do |pod| - podNameSpace = pod['metadata']['namespace'] - if podNameSpace.eql?("kube-system") && !pod['metadata'].key?("ownerReferences") - # The above case seems to be the only case where you have horizontal scaling of pods - # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash - # instead of the actual poduid. Since this uid is not being surface into the UX - # its ok to use this. - # Use kubernetes.io/config.hash to be able to correlate with cadvisor data - podUid = pod['metadata']['annotations']['kubernetes.io/config.hash'] - else - podUid = pod['metadata']['uid'] - end - if (!pod['spec']['containers'].nil? && !pod['spec']['nodeName'].nil?) - nodeName = pod['spec']['nodeName'] - pod['spec']['containers'].each do |container| - containerName = container['name'] - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - if (!container['resources'].nil? && !container['resources'].empty? && !container['resources'][metricCategory].nil? && !container['resources'][metricCategory][metricNameToCollect].nil?) - metricValue = getMetricNumericValue(metricNameToCollect, container['resources'][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = nodeName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #No container level limit for the given metric, so default to node level limit - else - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) - - metricValue = @@NodeMetrics[nodeMetricsHashKey] - #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = nodeName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - end - end - end - end - end - rescue => error - @Log.warn("getcontainerResourceRequestsAndLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") - return metricItems - end - return metricItems - end #getContainerResourceRequestAndLimits - - def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) - metricItems = [] - begin - metricInfo = metricJSON - clusterId = getClusterId - #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, - #if we are coming up with the time it should be same for all nodes - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - metricInfo['items'].each do |node| - if (!node['status'][metricCategory].nil?) - - # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" - metricValue = getMetricNumericValue(metricNameToCollect, node['status'][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem['DataItems'] = [] - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = node['metadata']['name'] - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + node['metadata']['name'] - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #push node level metrics to a inmem hash so that we can use it looking up at container level. - #Currently if container level cpu & memory limits are not defined we default to node level limits - @@NodeMetrics[clusterId + "/" + node['metadata']['name'] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") - end - end - rescue => error - @Log.warn("parseNodeLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") - end - return metricItems - end #parseNodeLimits - - def getMetricNumericValue(metricName, metricVal) - metricValue = metricVal - begin - case metricName - when "memory" #convert to bytes for memory - #https://kubernetes.io/docs/tasks/configure-pod-container/assign-memory-resource/ - if (metricValue.end_with?("Ki")) - metricValue.chomp!("Ki") - metricValue = Float(metricValue) * 1024.0 ** 1 - elsif (metricValue.end_with?("Mi")) - metricValue.chomp!("Mi") - metricValue = Float(metricValue) * 1024.0 ** 2 - elsif (metricValue.end_with?("Gi")) - metricValue.chomp!("Gi") - metricValue = Float(metricValue) * 1024.0 ** 3 - elsif (metricValue.end_with?("Ti")) - metricValue.chomp!("Ti") - metricValue = Float(metricValue) * 1024.0 ** 4 - elsif (metricValue.end_with?("Pi")) - metricValue.chomp!("Pi") - metricValue = Float(metricValue) * 1024.0 ** 5 - elsif (metricValue.end_with?("Ei")) - metricValue.chomp!("Ei") - metricValue = Float(metricValue) * 1024.0 ** 6 - elsif (metricValue.end_with?("Zi")) - metricValue.chomp!("Zi") - metricValue = Float(metricValue) * 1024.0 ** 7 - elsif (metricValue.end_with?("Yi")) - metricValue.chomp!("Yi") - metricValue = Float(metricValue) * 1024.0 ** 8 - elsif (metricValue.end_with?("K")) - metricValue.chomp!("K") - metricValue = Float(metricValue) * 1000.0 ** 1 - elsif (metricValue.end_with?("M")) - metricValue.chomp!("M") - metricValue = Float(metricValue) * 1000.0 ** 2 - elsif (metricValue.end_with?("G")) - metricValue.chomp!("G") - metricValue = Float(metricValue) * 1000.0 ** 3 - elsif (metricValue.end_with?("T")) - metricValue.chomp!("T") - metricValue = Float(metricValue) * 1000.0 ** 4 - elsif (metricValue.end_with?("P")) - metricValue.chomp!("P") - metricValue = Float(metricValue) * 1000.0 ** 5 - elsif (metricValue.end_with?("E")) - metricValue.chomp!("E") - metricValue = Float(metricValue) * 1000.0 ** 6 - elsif (metricValue.end_with?("Z")) - metricValue.chomp!("Z") - metricValue = Float(metricValue) * 1000.0 ** 7 - elsif (metricValue.end_with?("Y")) - metricValue.chomp!("Y") - metricValue = Float(metricValue) * 1000.0 ** 8 - else #assuming there are no units specified, it is bytes (the below conversion will fail for other unsupported 'units') - metricValue = Float(metricValue) - end - when "cpu" #convert to nanocores for cpu - #https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/ - if (metricValue.end_with?("m")) - metricValue.chomp!("m") - metricValue = Float(metricValue) * 1000.0 ** 2 - else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units') - metricValue = Float(metricValue) * 1000.0 ** 3 - end - else - @Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. Returning 0 for metric value") - metricValue = 0 - end #case statement - rescue => error - @Log.warn("getMetricNumericValue failed: #{error} for metric #{metricName} with value #{metricVal}. Returning 0 formetric value") - return 0 - end - return metricValue - end # getMetricNumericValue + def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) + metricItems = [] + begin + metricInfo = metricJSON + clusterId = getClusterId + #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, + #if we are coming up with the time it should be same for all nodes + metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + metricInfo["items"].each do |node| + if (!node["status"][metricCategory].nil?) + + # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem = {} + metricItem["DataItems"] = [] + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = node["metadata"]["name"] + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #push node level metrics to a inmem hash so that we can use it looking up at container level. + #Currently if container level cpu & memory limits are not defined we default to node level limits + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") + end end - end + rescue => error + @Log.warn("parseNodeLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + end + return metricItems + end #parseNodeLimits + def getMetricNumericValue(metricName, metricVal) + metricValue = metricVal + begin + case metricName + when "memory" #convert to bytes for memory + #https://kubernetes.io/docs/tasks/configure-pod-container/assign-memory-resource/ + if (metricValue.end_with?("Ki")) + metricValue.chomp!("Ki") + metricValue = Float(metricValue) * 1024.0 ** 1 + elsif (metricValue.end_with?("Mi")) + metricValue.chomp!("Mi") + metricValue = Float(metricValue) * 1024.0 ** 2 + elsif (metricValue.end_with?("Gi")) + metricValue.chomp!("Gi") + metricValue = Float(metricValue) * 1024.0 ** 3 + elsif (metricValue.end_with?("Ti")) + metricValue.chomp!("Ti") + metricValue = Float(metricValue) * 1024.0 ** 4 + elsif (metricValue.end_with?("Pi")) + metricValue.chomp!("Pi") + metricValue = Float(metricValue) * 1024.0 ** 5 + elsif (metricValue.end_with?("Ei")) + metricValue.chomp!("Ei") + metricValue = Float(metricValue) * 1024.0 ** 6 + elsif (metricValue.end_with?("Zi")) + metricValue.chomp!("Zi") + metricValue = Float(metricValue) * 1024.0 ** 7 + elsif (metricValue.end_with?("Yi")) + metricValue.chomp!("Yi") + metricValue = Float(metricValue) * 1024.0 ** 8 + elsif (metricValue.end_with?("K")) + metricValue.chomp!("K") + metricValue = Float(metricValue) * 1000.0 ** 1 + elsif (metricValue.end_with?("M")) + metricValue.chomp!("M") + metricValue = Float(metricValue) * 1000.0 ** 2 + elsif (metricValue.end_with?("G")) + metricValue.chomp!("G") + metricValue = Float(metricValue) * 1000.0 ** 3 + elsif (metricValue.end_with?("T")) + metricValue.chomp!("T") + metricValue = Float(metricValue) * 1000.0 ** 4 + elsif (metricValue.end_with?("P")) + metricValue.chomp!("P") + metricValue = Float(metricValue) * 1000.0 ** 5 + elsif (metricValue.end_with?("E")) + metricValue.chomp!("E") + metricValue = Float(metricValue) * 1000.0 ** 6 + elsif (metricValue.end_with?("Z")) + metricValue.chomp!("Z") + metricValue = Float(metricValue) * 1000.0 ** 7 + elsif (metricValue.end_with?("Y")) + metricValue.chomp!("Y") + metricValue = Float(metricValue) * 1000.0 ** 8 + else #assuming there are no units specified, it is bytes (the below conversion will fail for other unsupported 'units') + metricValue = Float(metricValue) + end + when "cpu" #convert to nanocores for cpu + #https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/ + if (metricValue.end_with?("m")) + metricValue.chomp!("m") + metricValue = Float(metricValue) * 1000.0 ** 2 + else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units') + metricValue = Float(metricValue) * 1000.0 ** 3 + end + else + @Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. Returning 0 for metric value") + metricValue = 0 + end #case statement + rescue => error + @Log.warn("getMetricNumericValue failed: #{error} for metric #{metricName} with value #{metricVal}. Returning 0 formetric value") + return 0 + end + return metricValue + end # getMetricNumericValue + end +end diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb new file mode 100644 index 000000000..1ec2e25d4 --- /dev/null +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -0,0 +1,89 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +module Fluent + class Win_CAdvisor_Perf_Input < Input + Plugin.register_input("wincadvisorperf", self) + + def initialize + super + require "yaml" + require "json" + + require_relative "CAdvisorMetricsAPIClient" + require_relative "oms_common" + require_relative "omslog" + end + + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.containerinsights.wincadvisorperf" + # config_param :mdmtag, :string, :default => "mdm.cadvisorperf" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def enumerate() + time = Time.now.to_f + begin + eventStream = MultiEventStream.new + metricData = CAdvisorMetricsAPIClient.getMetrics() + metricData.each do |record| + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + eventStream.add(time, record) if record + #router.emit(@tag, time, record) if record + end + + router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@mdmtag, eventStream) if eventStream + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) + $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + rescue => errorStr + $log.warn "Failed to retrieve cadvisor metric data for windows nodes: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + end + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_win_cadvisor_perf::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_win_cadvisor_perf::run_periodic: enumerate Failed to retrieve cadvisor perf metrics for windows nodes: #{errorStr}" + end + end + @mutex.lock + end + @mutex.unlock + end + end # Win_CAdvisor_Perf_Input +end # module From 08e5fd1032a77da2820e2df60914dbad98431931 Mon Sep 17 00:00:00 2001 From: rashmy Date: Thu, 7 Mar 2019 22:41:17 -0800 Subject: [PATCH 02/38] changes --- source/code/plugin/in_win_cadvisor_perf.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 1ec2e25d4..4b4c74a15 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -11,6 +11,7 @@ def initialize require "json" require_relative "CAdvisorMetricsAPIClient" + require_relative "KubernetesApiClient" require_relative "oms_common" require_relative "omslog" end @@ -46,6 +47,7 @@ def enumerate() time = Time.now.to_f begin eventStream = MultiEventStream.new + winNodes = KubernetesApiClient.getWindowsNodes() metricData = CAdvisorMetricsAPIClient.getMetrics() metricData.each do |record| record["DataType"] = "LINUX_PERF_BLOB" From a6e92a2aff54564e12ba9cacecc0381d9345f982 Mon Sep 17 00:00:00 2001 From: rashmy Date: Fri, 8 Mar 2019 18:08:30 -0800 Subject: [PATCH 03/38] changes --- installer/conf/kube.conf | 6 +- .../code/plugin/CAdvisorMetricsAPIClient.rb | 831 +++++++++--------- source/code/plugin/KubernetesApiClient.rb | 9 +- source/code/plugin/in_cadvisor_perf.rb | 152 ++-- source/code/plugin/in_win_cadvisor_perf.rb | 34 +- 5 files changed, 521 insertions(+), 511 deletions(-) diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 8a82af04b..f8aa67800 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -50,7 +50,7 @@ #cadvisor perf- Windows nodes type wincadvisorperf - tag oms.containerinsights.wincadvisorperf + tag oms.api.wincadvisorperf run_interval 60s log_level debug @@ -177,13 +177,13 @@ retry_mdm_post_wait_minutes 60 - + type out_oms log_level debug num_threads 5 buffer_chunk_limit 20m buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_wincadvisorperf*.buffer + buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer buffer_queue_limit 20 buffer_queue_full_action drop_oldest_chunk flush_interval 20s diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 3c36775af..a61b08618 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -2,424 +2,433 @@ # frozen_string_literal: true class CAdvisorMetricsAPIClient - - require 'json' - require 'logger' - require 'net/http' - require 'net/https' - require 'uri' - require 'date' - - require_relative 'oms_common' - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' - - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" - @Log = Logger.new(@LogPath, 2, 10*1048576) #keep last 2 files, max log file size = 10M - @@rxBytesLast = nil - @@rxBytesTimeLast = nil - @@txBytesLast = nil - @@txBytesTimeLast = nil - @@nodeCpuUsageNanoSecondsLast = nil - @@nodeCpuUsageNanoSecondsTimeLast = nil - @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i - @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i - - - def initialize - end - - class << self - def getSummaryStatsFromCAdvisor() - headers = {} - response = nil - @Log.info 'Getting CAdvisor Uri' - begin - cAdvisorUri = getCAdvisorUri() - if !cAdvisorUri.nil? - uri = URI.parse(cAdvisorUri) - http = Net::HTTP.new(uri.host, uri.port) - http.use_ssl = false - - cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) - response = http.request(cAdvisorApiRequest) - @Log.info "Got response code #{response.code} from #{uri.request_uri}" - end - rescue => error - @Log.warn("CAdvisor api request failed: #{error}") - end - return response - end - - def getCAdvisorUri() - begin - defaultHost = "http://localhost:10255" - relativeUri = "/stats/summary" - nodeIP = ENV['NODE_IP'] - if !nodeIP.nil? - @Log.info("Using #{nodeIP + relativeUri} for CAdvisor Uri") - return "http://#{nodeIP}:10255" + relativeUri - else - @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost + relativeUri} ") - return defaultHost + relativeUri - end - end - end - - def getMetrics() - metricDataItems = [] - begin - hostName = (OMS::Common.get_hostname) - metricInfo = JSON.parse(getSummaryStatsFromCAdvisor().body) - metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores","cpuUsageNanoCores")) - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) - metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) - - cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores") - if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? - metricDataItems.push(cpuUsageNanoSecondsRate) - end - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) - metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch")) - - networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") - if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? - metricDataItems.push(networkRxRate) - end - networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") - if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? - metricDataItems.push(networkTxRate) - end - - - rescue => error - @Log.warn("getContainerMetrics failed: #{error}") - return metricDataItems - end - return metricDataItems - end + require "json" + require "logger" + require "net/http" + require "net/https" + require "uri" + require "date" - def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) - metricItems = [] - clusterId = KubernetesApiClient.getClusterId - timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - begin - metricInfo = metricJSON - metricInfo['pods'].each do |pod| - podUid = pod['podRef']['uid'] - podName = pod['podRef']['name'] - podNamespace = pod['podRef']['namespace'] - - if (!pod['containers'].nil?) - pod['containers'].each do |container| - #cpu metric - containerName = container['name'] - metricValue = container['cpu'][cpuMetricNameToCollect] - metricTime = container['cpu']['time'] - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #Telemetry about agent performance - begin - # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers - # cadvisor does not have pod/container metadata. so would need more work to cache as pv & use - if (podName.downcase.start_with?('omsagent-') && podNamespace.eql?("kube-system") && containerName.downcase.start_with?('omsagent') && metricNametoReturn.eql?("cpuUsageNanoCores")) - - if (timeDifferenceInMinutes >= 10) - telemetryProps = {} - telemetryProps['PodName'] = podName - telemetryProps['ContainerName'] = containerName - ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) - end - end - rescue => errorStr - $log.warn("Exception while generating Telemetry from getcontainerCpuMetricItems failed: #{errorStr} for metric #{cpuMetricNameToCollect}") - end - end - end - end - # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) - if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("cpuUsageNanoCores")) - @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i - end - rescue => error - @Log.warn("getcontainerCpuMetricItems failed: #{error} for metric #{cpuMetricNameToCollect}") - return metricItems - end - return metricItems - end + require_relative "oms_common" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" - def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollect, metricNametoReturn) - metricItems = [] - clusterId = KubernetesApiClient.getClusterId - timeDifference = (DateTime.now.to_time.to_i - @@telemetryMemoryMetricTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - begin - metricInfo = metricJSON - metricInfo['pods'].each do |pod| - podUid = pod['podRef']['uid'] - podName = pod['podRef']['name'] - podNamespace = pod['podRef']['namespace'] - if (!pod['containers'].nil?) - pod['containers'].each do |container| - containerName = container['name'] - metricValue = container['memory'][memoryMetricNameToCollect] - metricTime = container['memory']['time'] - - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #Telemetry about agent performance - begin - # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers - # cadvisor does not have pod/container metadata. so would need more work to cache as pv & use - if (podName.downcase.start_with?('omsagent-') && podNamespace.eql?("kube-system") && containerName.downcase.start_with?('omsagent') && metricNametoReturn.eql?("memoryRssBytes")) - if (timeDifferenceInMinutes >= 10) - telemetryProps = {} - telemetryProps['PodName'] = podName - telemetryProps['ContainerName'] = containerName - ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) - end - end - rescue => errorStr - $log.warn("Exception while generating Telemetry from getcontainerMemoryMetricItems failed: #{errorStr} for metric #{memoryMetricNameToCollect}") - end - end - end - end - # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) - if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("memoryRssBytes")) - @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i - end - rescue => error - @Log.warn("getcontainerMemoryMetricItems failed: #{error} for metric #{memoryMetricNameToCollect}") - @Log.warn metricJSON - return metricItems - end - return metricItems - end + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" + @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M + @@rxBytesLast = nil + @@rxBytesTimeLast = nil + @@txBytesLast = nil + @@txBytesTimeLast = nil + @@nodeCpuUsageNanoSecondsLast = nil + @@nodeCpuUsageNanoSecondsTimeLast = nil + @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i + @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i - def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) - metricItem = {} - clusterId = KubernetesApiClient.getClusterId - begin - metricInfo = metricJSON - node = metricInfo['node'] - nodeName = node['nodeName'] - - - metricValue = node[metricCategory][metricNameToCollect] - metricTime = node[metricCategory]['time'] - - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + nodeName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - - rescue => error - @Log.warn("getNodeMetricItem failed: #{error} for metric #{metricNameToCollect}") - @Log.warn metricJSON - return metricItem - end - return metricItem - end + def initialize + end - def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) - metricItem = {} - clusterId = KubernetesApiClient.getClusterId - begin - - metricInfo = metricJSON - node = metricInfo['node'] - nodeName = node['nodeName'] - - metricValue = node[metricCategory][metricNameToCollect] - metricTime = node[metricCategory]['time'] - - if !(metricNameToCollect == "rxBytes" || metricNameToCollect == "txBytes" || metricNameToCollect == "usageCoreNanoSeconds" ) - @Log.warn("getNodeMetricItemRate : rateMetric is supported only for rxBytes, txBytes & usageCoreNanoSeconds and not for #{metricNameToCollect}") - return nil - elsif metricNameToCollect == "rxBytes" - if @@rxBytesLast.nil? || @@rxBytesTimeLast.nil? || @@rxBytesLast > metricValue #when kubelet is restarted the last condition will be true - @@rxBytesLast = metricValue - @@rxBytesTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@rxBytesLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@rxBytesTimeLast).to_time) - @@rxBytesLast = metricValue - @@rxBytesTimeLast = metricTime - metricValue = metricRateValue - end - elsif metricNameToCollect == "txBytes" - if @@txBytesLast.nil? || @@txBytesTimeLast.nil? || @@txBytesLast > metricValue #when kubelet is restarted the last condition will be true - @@txBytesLast = metricValue - @@txBytesTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@txBytesLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@txBytesTimeLast).to_time) - @@txBytesLast = metricValue - @@txBytesTimeLast = metricTime - metricValue = metricRateValue - end - else - if @@nodeCpuUsageNanoSecondsLast.nil? || @@nodeCpuUsageNanoSecondsTimeLast.nil? || @@nodeCpuUsageNanoSecondsLast > metricValue #when kubelet is restarted the last condition will be true - @@nodeCpuUsageNanoSecondsLast = metricValue - @@nodeCpuUsageNanoSecondsTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@nodeCpuUsageNanoSecondsLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time) - @@nodeCpuUsageNanoSecondsLast = metricValue - @@nodeCpuUsageNanoSecondsTimeLast = metricTime - metricValue = metricRateValue - end - end - - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + nodeName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - - rescue => error - @Log.warn("getNodeMetricItemRate failed: #{error} for metric #{metricNameToCollect}") - @Log.warn metricJSON - return nil - end - return metricItem - end + class << self + def getSummaryStatsFromCAdvisor(winNode) + headers = {} + response = nil + @Log.info "Getting CAdvisor Uri" + begin + cAdvisorUri = getCAdvisorUri(winNode) + if !cAdvisorUri.nil? + uri = URI.parse(cAdvisorUri) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = false + + cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) + response = http.request(cAdvisorApiRequest) + @Log.info "Got response code #{response.code} from #{uri.request_uri}" + end + rescue => error + @Log.warn("CAdvisor api request failed: #{error}") + end + return response + end + + def getCAdvisorUri(winNode) + begin + defaultHost = "http://localhost:10255" + relativeUri = "/stats/summary" + if !winNode.nil? + nodeIP = winNode["InternalIP"] + else + nodeIP = ENV["NODE_IP"] + end + if !nodeIP.nil? + @Log.info("Using #{nodeIP + relativeUri} for CAdvisor Uri") + return "http://#{nodeIP}:10255" + relativeUri + else + @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost + relativeUri} ") + if !winNode.nil? + return nil + else + return defaultHost + relativeUri + end + end + end + end + + def getMetrics(winNode = nil) + metricDataItems = [] + begin + if !winNode.nil? + hostName = winNode["Hostname"] + else + hostName = (OMS::Common.get_hostname) + end + metricInfo = JSON.parse(getSummaryStatsFromCAdvisor(winNode).body) + #@Log.info "metric info: #{metricInfo}" + if !metricInfo.nil? + metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", "cpuUsageNanoCores")) + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) + metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) + + cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores") + if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? + metricDataItems.push(cpuUsageNanoSecondsRate) + end + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) + metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch")) - def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn) - metricItem = {} - clusterId = KubernetesApiClient.getClusterId - - begin - metricInfo = metricJSON - node = metricInfo['node'] - nodeName = node['nodeName'] - - - metricValue = node['startTime'] - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + nodeName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - #Read it from /proc/uptime - metricCollections['Value'] = DateTime.parse(metricTime).to_time.to_i - IO.read("/proc/uptime").split[0].to_f - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - - rescue => error - @Log.warn("getNodeLastRebootTimeMetric failed: #{error} ") - @Log.warn metricJSON - return metricItem - end - return metricItem + networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") + if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? + metricDataItems.push(networkRxRate) + end + networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") + if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? + metricDataItems.push(networkTxRate) + end + else + @Log.warn("Couldn't get metric information for host: #{hostName}") + end + rescue => error + @Log.warn("getContainerMetrics failed: #{error}") + return metricDataItems + end + return metricDataItems + end + + def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + + if (!pod["containers"].nil?) + pod["containers"].each do |container| + #cpu metric + containerName = container["name"] + metricValue = container["cpu"][cpuMetricNameToCollect] + metricTime = container["cpu"]["time"] + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #Telemetry about agent performance + begin + # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers + # cadvisor does not have pod/container metadata. so would need more work to cache as pv & use + if (podName.downcase.start_with?("omsagent-") && podNamespace.eql?("kube-system") && containerName.downcase.start_with?("omsagent") && metricNametoReturn.eql?("cpuUsageNanoCores")) + if (timeDifferenceInMinutes >= 10) + telemetryProps = {} + telemetryProps["PodName"] = podName + telemetryProps["ContainerName"] = containerName + ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) + end end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getcontainerCpuMetricItems failed: #{errorStr} for metric #{cpuMetricNameToCollect}") + end + end + end + end + # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) + if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("cpuUsageNanoCores")) + @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i + end + rescue => error + @Log.warn("getcontainerCpuMetricItems failed: #{error} for metric #{cpuMetricNameToCollect}") + return metricItems + end + return metricItems + end + + def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollect, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + timeDifference = (DateTime.now.to_time.to_i - @@telemetryMemoryMetricTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + if (!pod["containers"].nil?) + pod["containers"].each do |container| + containerName = container["name"] + metricValue = container["memory"][memoryMetricNameToCollect] + metricTime = container["memory"]["time"] + + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn) - metricItems = [] - clusterId = KubernetesApiClient.getClusterId - currentTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - begin - metricInfo = metricJSON - metricInfo['pods'].each do |pod| - podUid = pod['podRef']['uid'] - if (!pod['containers'].nil?) - pod['containers'].each do |container| - containerName = container['name'] - metricValue = container['startTime'] - metricTime = currentTime - - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = DateTime.parse(metricValue).to_time.to_i - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - end - end - end - rescue => error - @Log.warn("getContainerStartTimeMetric failed: #{error} for metric #{metricNametoReturn}") - @Log.warn metricJSON - return metricItems - end - return metricItems + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #Telemetry about agent performance + begin + # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers + # cadvisor does not have pod/container metadata. so would need more work to cache as pv & use + if (podName.downcase.start_with?("omsagent-") && podNamespace.eql?("kube-system") && containerName.downcase.start_with?("omsagent") && metricNametoReturn.eql?("memoryRssBytes")) + if (timeDifferenceInMinutes >= 10) + telemetryProps = {} + telemetryProps["PodName"] = podName + telemetryProps["ContainerName"] = containerName + ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) + end end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getcontainerMemoryMetricItems failed: #{errorStr} for metric #{memoryMetricNameToCollect}") + end + end + end + end + # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) + if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("memoryRssBytes")) + @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + end + rescue => error + @Log.warn("getcontainerMemoryMetricItems failed: #{error} for metric #{memoryMetricNameToCollect}") + @Log.warn metricJSON + return metricItems + end + return metricItems + end + + def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) + metricItem = {} + clusterId = KubernetesApiClient.getClusterId + begin + metricInfo = metricJSON + node = metricInfo["node"] + nodeName = node["nodeName"] + + if !node[metricCategory].nil? + metricValue = node[metricCategory][metricNameToCollect] + metricTime = node[metricCategory]["time"] + + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + nodeName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + end + rescue => error + @Log.warn("getNodeMetricItem failed: #{error} for metric #{metricNameToCollect}") + @Log.warn metricJSON + return metricItem + end + return metricItem + end + + def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) + metricItem = {} + clusterId = KubernetesApiClient.getClusterId + begin + metricInfo = metricJSON + node = metricInfo["node"] + nodeName = node["nodeName"] + + if !node[metricCategory].nil? + metricValue = node[metricCategory][metricNameToCollect] + metricTime = node[metricCategory]["time"] + + if !(metricNameToCollect == "rxBytes" || metricNameToCollect == "txBytes" || metricNameToCollect == "usageCoreNanoSeconds") + @Log.warn("getNodeMetricItemRate : rateMetric is supported only for rxBytes, txBytes & usageCoreNanoSeconds and not for #{metricNameToCollect}") + return nil + elsif metricNameToCollect == "rxBytes" + if @@rxBytesLast.nil? || @@rxBytesTimeLast.nil? || @@rxBytesLast > metricValue #when kubelet is restarted the last condition will be true + @@rxBytesLast = metricValue + @@rxBytesTimeLast = metricTime + return nil + else + metricRateValue = ((metricValue - @@rxBytesLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@rxBytesTimeLast).to_time) + @@rxBytesLast = metricValue + @@rxBytesTimeLast = metricTime + metricValue = metricRateValue + end + elsif metricNameToCollect == "txBytes" + if @@txBytesLast.nil? || @@txBytesTimeLast.nil? || @@txBytesLast > metricValue #when kubelet is restarted the last condition will be true + @@txBytesLast = metricValue + @@txBytesTimeLast = metricTime + return nil + else + metricRateValue = ((metricValue - @@txBytesLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@txBytesTimeLast).to_time) + @@txBytesLast = metricValue + @@txBytesTimeLast = metricTime + metricValue = metricRateValue + end + else + if @@nodeCpuUsageNanoSecondsLast.nil? || @@nodeCpuUsageNanoSecondsTimeLast.nil? || @@nodeCpuUsageNanoSecondsLast > metricValue #when kubelet is restarted the last condition will be true + @@nodeCpuUsageNanoSecondsLast = metricValue + @@nodeCpuUsageNanoSecondsTimeLast = metricTime + return nil + else + metricRateValue = ((metricValue - @@nodeCpuUsageNanoSecondsLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time) + @@nodeCpuUsageNanoSecondsLast = metricValue + @@nodeCpuUsageNanoSecondsTimeLast = metricTime + metricValue = metricRateValue + end + end + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + nodeName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + end + rescue => error + @Log.warn("getNodeMetricItemRate failed: #{error} for metric #{metricNameToCollect}") + @Log.warn metricJSON + return nil + end + return metricItem + end + + def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn) + metricItem = {} + clusterId = KubernetesApiClient.getClusterId + + begin + metricInfo = metricJSON + node = metricInfo["node"] + nodeName = node["nodeName"] + + metricValue = node["startTime"] + metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + nodeName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + #Read it from /proc/uptime + metricCollections["Value"] = DateTime.parse(metricTime).to_time.to_i - IO.read("/proc/uptime").split[0].to_f + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + rescue => error + @Log.warn("getNodeLastRebootTimeMetric failed: #{error} ") + @Log.warn metricJSON + return metricItem + end + return metricItem + end + + def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + currentTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + if (!pod["containers"].nil?) + pod["containers"].each do |container| + containerName = container["name"] + metricValue = container["startTime"] + metricTime = currentTime + + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = DateTime.parse(metricValue).to_time.to_i + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) end + end end + rescue => error + @Log.warn("getContainerStartTimeMetric failed: #{error} for metric #{metricNametoReturn}") + @Log.warn metricJSON + return metricItems + end + return metricItems + end + end +end diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 648084002..17e7adedb 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -224,21 +224,21 @@ def getPods(namespace) def getWindowsNodes winNodes = [] begin - # kubesystemResourceUri = "namespaces/" + namespace + "/pods" - # podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) nodeInventory = JSON.parse(getKubeResourceInfo("nodes").body) + @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" if (!nodeInventory.empty?) nodeInventory["items"].each do |item| # check for windows operating system in node metadata winNode = {} nodeStatus = item["status"] if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? + @Log.info "KubernetesAPIClient::getWindowsNodes : Iterating through nodes to get windows nodes" operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] - if (operatingSystem.is_a? String && operatingSystem.casecmp("windows") == 0) + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) nodeStatusAddresses = nodeStatus["addresses"] if !nodeStatusAddresses.nil? nodeStatusAddresses.each do |address| - winNode[address["type"]] = address[address] + winNode[address["type"]] = address["address"] end winNodes.push(winNode) end @@ -250,7 +250,6 @@ def getWindowsNodes rescue => error @Log.warn("Error in get windows nodes: #{error}") end - return pods end def getContainerIDs(namespace) diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index a857aa6b9..f5f65f01b 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -2,90 +2,88 @@ # frozen_string_literal: true module Fluent - - class CAdvisor_Perf_Input < Input - Plugin.register_input('cadvisorperf', self) - - def initialize - super - require 'yaml' - require 'json' - - require_relative 'CAdvisorMetricsAPIClient' - require_relative 'oms_common' - require_relative 'omslog' - end - - config_param :run_interval, :time, :default => '1m' - config_param :tag, :string, :default => "oms.api.cadvisorperf" - config_param :mdmtag, :string, :default => "mdm.cadvisorperf" - - def configure (conf) - super + class CAdvisor_Perf_Input < Input + Plugin.register_input("cadvisorperf", self) + + def initialize + super + require "yaml" + require "json" + + require_relative "CAdvisorMetricsAPIClient" + require_relative "oms_common" + require_relative "omslog" + end + + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.api.cadvisorperf" + config_param :mdmtag, :string, :default => "mdm.cadvisorperf" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) - end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join + end + + def enumerate() + time = Time.now.to_f + begin + eventStream = MultiEventStream.new + metricData = CAdvisorMetricsAPIClient.getMetrics() + metricData.each do |record| + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + eventStream.add(time, record) if record + #router.emit(@tag, time, record) if record end - end - - def enumerate() - time = Time.now.to_f - begin - eventStream = MultiEventStream.new - metricData = CAdvisorMetricsAPIClient.getMetrics() - metricData.each do |record| - record['DataType'] = "LINUX_PERF_BLOB" - record['IPName'] = "LogManagement" - eventStream.add(time, record) if record - #router.emit(@tag, time, record) if record - end - - router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@mdmtag, eventStream) if eventStream - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) - $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => errorStr - $log.warn "Failed to retrieve cadvisor metric data: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) + + router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@mdmtag, eventStream) if eventStream + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) + $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + rescue => errorStr + $log.warn "Failed to retrieve cadvisor metric data: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) end - - def run_periodic - @mutex.lock + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) done = @finished - until done - @condition.wait(@mutex, @run_interval) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_cadvisor_perf::run_periodic @ #{Time.now.utc.iso8601}") - enumerate - rescue => errorStr - $log.warn "in_cadvisor_perf::run_periodic: enumerate Failed to retrieve cadvisor perf metrics: #{errorStr}" - end + @mutex.unlock + if !done + begin + $log.info("in_cadvisor_perf::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_cadvisor_perf::run_periodic: enumerate Failed to retrieve cadvisor perf metrics: #{errorStr}" end - @mutex.lock end - @mutex.unlock + @mutex.lock end - end # CAdvisor_Perf_Input + @mutex.unlock + end + end # CAdvisor_Perf_Input end # module - diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 4b4c74a15..4cc265bcc 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -17,8 +17,7 @@ def initialize end config_param :run_interval, :time, :default => "1m" - config_param :tag, :string, :default => "oms.containerinsights.wincadvisorperf" - # config_param :mdmtag, :string, :default => "mdm.cadvisorperf" + config_param :tag, :string, :default => "oms.api.wincadvisorperf" def configure(conf) super @@ -47,20 +46,25 @@ def enumerate() time = Time.now.to_f begin eventStream = MultiEventStream.new + $log.info "in_win_cadvisor_perf : Getting windows nodes" winNodes = KubernetesApiClient.getWindowsNodes() - metricData = CAdvisorMetricsAPIClient.getMetrics() - metricData.each do |record| - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" - eventStream.add(time, record) if record - #router.emit(@tag, time, record) if record - end - - router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@mdmtag, eventStream) if eventStream - @@istestvar = ENV["ISTEST"] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) - $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") + $log.info "in_win_cadvisor_perf : Successuly got windows nodes" + winNodes.each do |winNode| + metricData = CAdvisorMetricsAPIClient.getMetrics(winNode) + # $log.info "windows node metric data: #{metricData}" + metricData.each do |record| + if !record.empty? + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + eventStream.add(time, record) if record + #$log.info "windows node record: #{record}" + end + end + router.emit_stream(@tag, eventStream) if eventStream + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) + $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end rescue => errorStr $log.warn "Failed to retrieve cadvisor metric data for windows nodes: #{errorStr}" From 8510d50a93d087b5b73f19cc121d13791fdc0578 Mon Sep 17 00:00:00 2001 From: rashmy Date: Tue, 12 Mar 2019 12:12:48 -0700 Subject: [PATCH 04/38] changes --- .../code/plugin/CAdvisorMetricsAPIClient.rb | 124 +++++++++++------- 1 file changed, 75 insertions(+), 49 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index a61b08618..3d41e647b 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -15,12 +15,14 @@ class CAdvisorMetricsAPIClient @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M - @@rxBytesLast = nil - @@rxBytesTimeLast = nil - @@txBytesLast = nil - @@txBytesTimeLast = nil + # @@rxBytesLast = nil + # @@rxBytesTimeLast = nil + # @@txBytesLast = nil + # @@txBytesTimeLast = nil @@nodeCpuUsageNanoSecondsLast = nil @@nodeCpuUsageNanoSecondsTimeLast = nil + @@winNodeCpuUsageNanoSecondsLast = {} + @@winNodeCpuUsageNanoSecondsTimeLast = {} @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i @@ -77,8 +79,10 @@ def getMetrics(winNode = nil) begin if !winNode.nil? hostName = winNode["Hostname"] + operatingSystem = "Windows" else hostName = (OMS::Common.get_hostname) + operatingSystem = "Linux" end metricInfo = JSON.parse(getSummaryStatsFromCAdvisor(winNode).body) #@Log.info "metric info: #{metricInfo}" @@ -88,24 +92,26 @@ def getMetrics(winNode = nil) metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) - cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores") + cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores", operatingSystem) if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? metricDataItems.push(cpuUsageNanoSecondsRate) end metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) + metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch")) - networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") - if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? - metricDataItems.push(networkRxRate) - end - networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") - if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? - metricDataItems.push(networkTxRate) - end + # Disabling networkRxRate and networkTxRate since we dont use it as of now. + #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) + #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) + # networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") + # if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? + # metricDataItems.push(networkRxRate) + # end + # networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") + # if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? + # metricDataItems.push(networkTxRate) + # end else @Log.warn("Couldn't get metric information for host: #{hostName}") end @@ -280,7 +286,7 @@ def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, return metricItem end - def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) + def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn, operatingSystem) metricItem = {} clusterId = KubernetesApiClient.getClusterId begin @@ -292,41 +298,61 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl metricValue = node[metricCategory][metricNameToCollect] metricTime = node[metricCategory]["time"] - if !(metricNameToCollect == "rxBytes" || metricNameToCollect == "txBytes" || metricNameToCollect == "usageCoreNanoSeconds") - @Log.warn("getNodeMetricItemRate : rateMetric is supported only for rxBytes, txBytes & usageCoreNanoSeconds and not for #{metricNameToCollect}") + # if !(metricNameToCollect == "rxBytes" || metricNameToCollect == "txBytes" || metricNameToCollect == "usageCoreNanoSeconds") + # @Log.warn("getNodeMetricItemRate : rateMetric is supported only for rxBytes, txBytes & usageCoreNanoSeconds and not for #{metricNameToCollect}") + if !(metricNameToCollect == "usageCoreNanoSeconds") + @Log.warn("getNodeMetricItemRate : rateMetric is supported only for usageCoreNanoSeconds and not for #{metricNameToCollect}") return nil - elsif metricNameToCollect == "rxBytes" - if @@rxBytesLast.nil? || @@rxBytesTimeLast.nil? || @@rxBytesLast > metricValue #when kubelet is restarted the last condition will be true - @@rxBytesLast = metricValue - @@rxBytesTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@rxBytesLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@rxBytesTimeLast).to_time) - @@rxBytesLast = metricValue - @@rxBytesTimeLast = metricTime - metricValue = metricRateValue - end - elsif metricNameToCollect == "txBytes" - if @@txBytesLast.nil? || @@txBytesTimeLast.nil? || @@txBytesLast > metricValue #when kubelet is restarted the last condition will be true - @@txBytesLast = metricValue - @@txBytesTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@txBytesLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@txBytesTimeLast).to_time) - @@txBytesLast = metricValue - @@txBytesTimeLast = metricTime - metricValue = metricRateValue - end + # elsif metricNameToCollect == "rxBytes" + # if @@rxBytesLast.nil? || @@rxBytesTimeLast.nil? || @@rxBytesLast > metricValue #when kubelet is restarted the last condition will be true + # @@rxBytesLast = metricValue + # @@rxBytesTimeLast = metricTime + # return nil + # else + # metricRateValue = ((metricValue - @@rxBytesLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@rxBytesTimeLast).to_time) + # @@rxBytesLast = metricValue + # @@rxBytesTimeLast = metricTime + # metricValue = metricRateValue + # end + # elsif metricNameToCollect == "txBytes" + # if @@txBytesLast.nil? || @@txBytesTimeLast.nil? || @@txBytesLast > metricValue #when kubelet is restarted the last condition will be true + # @@txBytesLast = metricValue + # @@txBytesTimeLast = metricTime + # return nil + # else + # metricRateValue = ((metricValue - @@txBytesLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@txBytesTimeLast).to_time) + # @@txBytesLast = metricValue + # @@txBytesTimeLast = metricTime + # metricValue = metricRateValue + # end else - if @@nodeCpuUsageNanoSecondsLast.nil? || @@nodeCpuUsageNanoSecondsTimeLast.nil? || @@nodeCpuUsageNanoSecondsLast > metricValue #when kubelet is restarted the last condition will be true - @@nodeCpuUsageNanoSecondsLast = metricValue - @@nodeCpuUsageNanoSecondsTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@nodeCpuUsageNanoSecondsLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time) - @@nodeCpuUsageNanoSecondsLast = metricValue - @@nodeCpuUsageNanoSecondsTimeLast = metricTime - metricValue = metricRateValue + if (operatingSystem == "Linux") + if @@nodeCpuUsageNanoSecondsLast.nil? || @@nodeCpuUsageNanoSecondsTimeLast.nil? || @@nodeCpuUsageNanoSecondsLast > metricValue #when kubelet is restarted the last condition will be true + @@nodeCpuUsageNanoSecondsLast = metricValue + @@nodeCpuUsageNanoSecondsTimeLast = metricTime + return nil + else + metricRateValue = ((metricValue - @@nodeCpuUsageNanoSecondsLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time) + @@nodeCpuUsageNanoSecondsLast = metricValue + @@nodeCpuUsageNanoSecondsTimeLast = metricTime + metricValue = metricRateValue + end + elsif (operatingSystem == "Windows") + # Using the hash for windows nodes since this is running in replica set and there can be multiple nodes + if @@winNodeCpuUsageNanoSecondsLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsTimeLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsLast[hostName] > metricValue #when kubelet is restarted the last condition will be true + @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue + @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime + @Log.info "@@nodeCpuUsageNanoSecondsLast: #{@@nodeCpuUsageNanoSecondsLast}" + @Log.info "@@nodeCpuUsageNanoSecondsTimeLast: #{@@nodeCpuUsageNanoSecondsTimeLast}" + @Log.info "In condition 1" + @Log.info "metricValue :#{metricValue}" + @Log.info "metricTime :#{metricTime}" + return nil + else + metricRateValue = ((metricValue - @@winNodeCpuUsageNanoSecondsLast[hostName]) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[hostName]).to_time) + @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue + @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime + metricValue = metricRateValue end end metricItem["DataItems"] = [] From c144b6705d2714de420641c071c7c8ae4e1f676c Mon Sep 17 00:00:00 2001 From: rashmy Date: Tue, 12 Mar 2019 16:18:34 -0700 Subject: [PATCH 05/38] node cpu metric rate changes --- .../code/plugin/CAdvisorMetricsAPIClient.rb | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 3d41e647b..c1ba7f5da 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -326,7 +326,7 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl # metricValue = metricRateValue # end else - if (operatingSystem == "Linux") + if operatingSystem == "Linux" if @@nodeCpuUsageNanoSecondsLast.nil? || @@nodeCpuUsageNanoSecondsTimeLast.nil? || @@nodeCpuUsageNanoSecondsLast > metricValue #when kubelet is restarted the last condition will be true @@nodeCpuUsageNanoSecondsLast = metricValue @@nodeCpuUsageNanoSecondsTimeLast = metricTime @@ -337,22 +337,31 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl @@nodeCpuUsageNanoSecondsTimeLast = metricTime metricValue = metricRateValue end - elsif (operatingSystem == "Windows") - # Using the hash for windows nodes since this is running in replica set and there can be multiple nodes - if @@winNodeCpuUsageNanoSecondsLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsTimeLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsLast[hostName] > metricValue #when kubelet is restarted the last condition will be true - @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue - @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime - @Log.info "@@nodeCpuUsageNanoSecondsLast: #{@@nodeCpuUsageNanoSecondsLast}" - @Log.info "@@nodeCpuUsageNanoSecondsTimeLast: #{@@nodeCpuUsageNanoSecondsTimeLast}" - @Log.info "In condition 1" - @Log.info "metricValue :#{metricValue}" - @Log.info "metricTime :#{metricTime}" - return nil - else - metricRateValue = ((metricValue - @@winNodeCpuUsageNanoSecondsLast[hostName]) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[hostName]).to_time) - @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue - @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime - metricValue = metricRateValue + elsif operatingSystem == "Windows" + # Using the hash for windows nodes since this is running in replica set and there can be multiple nodes + if @@winNodeCpuUsageNanoSecondsLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsTimeLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsLast[hostName] > metricValue #when kubelet is restarted the last condition will be true + @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue + @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime + # @Log.info "@@winNodeCpuUsageNanoSecondsLast[#{hostName}]: #{@@winNodeCpuUsageNanoSecondsLast[hostName]}" + # @Log.info "@@winNodeCpuUsageNanoSecondsLast[#{hostName}]: #{@@winNodeCpuUsageNanoSecondsLast[hostName]}" + # @Log.info "In condition 1" + # @Log.info "metricValue :#{metricValue}" + # @Log.info "metricTime :#{metricTime}" + return nil + else + metricRateValue = ((metricValue - @@winNodeCpuUsageNanoSecondsLast[hostName]) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[hostName]).to_time) + # @Log.info "In condition 2" + # @Log.info "metricValue :#{metricValue}" + # @Log.info "@@winNodeCpuUsageNanoSecondsLast[#{hostName}]: #{@@winNodeCpuUsageNanoSecondsLast[hostName]}" + # @Log.info "metricTime :#{metricTime}" + # @Log.info "DateTime.parse(metricTime).to_time: #{DateTime.parse(metricTime).to_time}" + # @Log.info "@@winNodeCpuUsageNanoSecondsTimeLast[#{hostName}]: #{@@winNodeCpuUsageNanoSecondsTimeLast[hostName]}" + # @Log.info "DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[#{hostName}].to_time: #{DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[hostName]).to_time}" + # @Log.info "metricRateValue: #{metricRateValue}" + @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue + @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime + metricValue = metricRateValue + end end end metricItem["DataItems"] = [] From bcdf0c3aa335ea1082ec52ab1e453bd0dbe8b5ae Mon Sep 17 00:00:00 2001 From: rashmy Date: Tue, 12 Mar 2019 18:53:41 -0700 Subject: [PATCH 06/38] container cpu rate --- .../code/plugin/CAdvisorMetricsAPIClient.rb | 101 +++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index c1ba7f5da..b814eb46e 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -23,6 +23,8 @@ class CAdvisorMetricsAPIClient @@nodeCpuUsageNanoSecondsTimeLast = nil @@winNodeCpuUsageNanoSecondsLast = {} @@winNodeCpuUsageNanoSecondsTimeLast = {} + @@winContainerCpuUsageNanoSecondsLast = {} + @@winContainerCpuUsageNanoSecondsTimeLast = {} @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i @@ -87,11 +89,19 @@ def getMetrics(winNode = nil) metricInfo = JSON.parse(getSummaryStatsFromCAdvisor(winNode).body) #@Log.info "metric info: #{metricInfo}" if !metricInfo.nil? - metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", "cpuUsageNanoCores")) metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) + if operatingSystem == "Linux" + metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", "cpuUsageNanoCores")) + elsif operatingSystem == "Windows" + containerCpuUsageNanoSecondsRate = getContainerCpuMetricItemRate(metricInfo, hostName, "usageCoreNanoSeconds", "cpuUsageNanoCores") + if containerCpuUsageNanoSecondsRate && !containerCpuUsageNanoSecondsRate.empty? && !containerCpuUsageNanoSecondsRate.nil? + metricDataItems.push(containerCpuUsageNanoSecondsRate) + end + end + cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores", operatingSystem) if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? metricDataItems.push(cpuUsageNanoSecondsRate) @@ -186,6 +196,95 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met return metricItems end + # usageNanoCores doesnt exist for windows nodes. Hence need to compute this from usageCoreNanoSeconds + def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + + if (!pod["containers"].nil?) + pod["containers"].each do |container| + #cpu metric + containerName = container["name"] + metricValue = container["cpu"][cpuMetricNameToCollect] + metricTime = container["cpu"]["time"] + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + + containerId = podUid + "/" + containerName + if @@winContainerCpuUsageNanoSecondsLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsTimeLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsLast[containerId] > metricValue #when kubelet is restarted the last condition will be true + @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue + @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime + @Log.info "In condition 1" + @Log.info "metricValue :#{metricValue}" + @Log.info "metricTime :#{metricTime}" + return nil + else + metricRateValue = ((metricValue - @@winContainerCpuUsageNanoSecondsLast[containerId]) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time) + @Log.info "In condition 2" + @Log.info "metricValue :#{metricValue}" + @Log.info "@@winContainerCpuUsageNanoSecondsLast[#{containerId}]: #{@@winContainerCpuUsageNanoSecondsLast[containerId]}" + @Log.info "metricTime :#{metricTime}" + @Log.info "DateTime.parse(metricTime).to_time: #{DateTime.parse(metricTime).to_time}" + @Log.info "@@winContainerCpuUsageNanoSecondsTimeLast[#{containerId}]: #{@@winContainerCpuUsageNanoSecondsTimeLast[containerId]}" + @Log.info "DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[#{containerId}].to_time: #{DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time}" + @Log.info "metricRateValue: #{metricRateValue}" + @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue + @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime + metricValue = metricRateValue + end + + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #Telemetry about agent performance + begin + # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers + # cadvisor does not have pod/container metadata. so would need more work to cache as pv & use + if (podName.downcase.start_with?("omsagent-") && podNamespace.eql?("kube-system") && containerName.downcase.start_with?("omsagent") && metricNametoReturn.eql?("cpuUsageNanoCores")) + if (timeDifferenceInMinutes >= 10) + telemetryProps = {} + telemetryProps["PodName"] = podName + telemetryProps["ContainerName"] = containerName + ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) + end + end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getcontainerCpuMetricItems failed: #{errorStr} for metric #{cpuMetricNameToCollect}") + end + end + end + end + # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) + if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("cpuUsageNanoCores")) + @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i + end + rescue => error + @Log.warn("getcontainerCpuMetricItemRate failed: #{error} for metric #{cpuMetricNameToCollect}") + return metricItems + end + return metricItems + end + def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollect, metricNametoReturn) metricItems = [] clusterId = KubernetesApiClient.getClusterId From 89f5a00eb174d5e68136efe86fc3e0f2d3a89062 Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 13 Mar 2019 16:49:57 -0700 Subject: [PATCH 07/38] changes --- .../code/plugin/CAdvisorMetricsAPIClient.rb | 37 ++++++++++++------- source/code/plugin/in_win_cadvisor_perf.rb | 10 ++++- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index b814eb46e..3f4f66a00 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -90,15 +90,16 @@ def getMetrics(winNode = nil) #@Log.info "metric info: #{metricInfo}" if !metricInfo.nil? metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) + # metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) if operatingSystem == "Linux" metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", "cpuUsageNanoCores")) + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) elsif operatingSystem == "Windows" containerCpuUsageNanoSecondsRate = getContainerCpuMetricItemRate(metricInfo, hostName, "usageCoreNanoSeconds", "cpuUsageNanoCores") if containerCpuUsageNanoSecondsRate && !containerCpuUsageNanoSecondsRate.empty? && !containerCpuUsageNanoSecondsRate.nil? - metricDataItems.push(containerCpuUsageNanoSecondsRate) + metricDataItems.concat(containerCpuUsageNanoSecondsRate) end end @@ -167,6 +168,7 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) + # @Log.info "Container : countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" #Telemetry about agent performance begin # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers @@ -232,20 +234,20 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, if @@winContainerCpuUsageNanoSecondsLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsTimeLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsLast[containerId] > metricValue #when kubelet is restarted the last condition will be true @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime - @Log.info "In condition 1" - @Log.info "metricValue :#{metricValue}" - @Log.info "metricTime :#{metricTime}" + # @Log.info "In condition 1" + # @Log.info "metricValue :#{metricValue}" + # @Log.info "metricTime :#{metricTime}" return nil else metricRateValue = ((metricValue - @@winContainerCpuUsageNanoSecondsLast[containerId]) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time) - @Log.info "In condition 2" - @Log.info "metricValue :#{metricValue}" - @Log.info "@@winContainerCpuUsageNanoSecondsLast[#{containerId}]: #{@@winContainerCpuUsageNanoSecondsLast[containerId]}" - @Log.info "metricTime :#{metricTime}" - @Log.info "DateTime.parse(metricTime).to_time: #{DateTime.parse(metricTime).to_time}" - @Log.info "@@winContainerCpuUsageNanoSecondsTimeLast[#{containerId}]: #{@@winContainerCpuUsageNanoSecondsTimeLast[containerId]}" - @Log.info "DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[#{containerId}].to_time: #{DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time}" - @Log.info "metricRateValue: #{metricRateValue}" + # @Log.info "In condition 2" + # @Log.info "metricValue :#{metricValue}" + # @Log.info "@@winContainerCpuUsageNanoSecondsLast[#{containerId}]: #{@@winContainerCpuUsageNanoSecondsLast[containerId]}" + # @Log.info "metricTime :#{metricTime}" + # @Log.info "DateTime.parse(metricTime).to_time: #{DateTime.parse(metricTime).to_time}" + # @Log.info "@@winContainerCpuUsageNanoSecondsTimeLast[#{containerId}]: #{@@winContainerCpuUsageNanoSecondsTimeLast[containerId]}" + # @Log.info "DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[#{containerId}].to_time: #{DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time}" + # @Log.info "metricRateValue: #{metricRateValue}" @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime metricValue = metricRateValue @@ -256,6 +258,8 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) + # @Log.info "Container: countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" + @Log.info "#{metricItem}" #Telemetry about agent performance begin # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers @@ -319,6 +323,8 @@ def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollec metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) + @Log.info "#{metricItem}" + # @Log.info "Container - countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" #Telemetry about agent performance begin # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers @@ -376,6 +382,7 @@ def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) + # @Log.info "Node - countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" end rescue => error @Log.warn("getNodeMetricItem failed: #{error} for metric #{metricNameToCollect}") @@ -477,7 +484,9 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl metricCollections["Value"] = metricValue metricProps["Collections"].push(metricCollections) + # @Log.info "Node - countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" metricItem["DataItems"].push(metricProps) + @Log.info "#{metricItem}" end rescue => error @Log.warn("getNodeMetricItemRate failed: #{error} for metric #{metricNameToCollect}") @@ -515,6 +524,7 @@ def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn) metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) + # @Log.info "Node - countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" rescue => error @Log.warn("getNodeLastRebootTimeMetric failed: #{error} ") @Log.warn metricJSON @@ -554,6 +564,7 @@ def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn) metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) + # @Log.info "Container - countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" end end end diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 4cc265bcc..7cd34eb16 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -46,21 +46,29 @@ def enumerate() time = Time.now.to_f begin eventStream = MultiEventStream.new + # $log.info "1" $log.info "in_win_cadvisor_perf : Getting windows nodes" winNodes = KubernetesApiClient.getWindowsNodes() + # $log.info "2" $log.info "in_win_cadvisor_perf : Successuly got windows nodes" winNodes.each do |winNode| + # $log.info "3" metricData = CAdvisorMetricsAPIClient.getMetrics(winNode) + # $log.info "4" # $log.info "windows node metric data: #{metricData}" metricData.each do |record| + # $log.info "5" if !record.empty? + # $log.info "6" record["DataType"] = "LINUX_PERF_BLOB" record["IPName"] = "LogManagement" eventStream.add(time, record) if record - #$log.info "windows node record: #{record}" + $log.info "windows node record: #{record}" + # $log.info "7" end end router.emit_stream(@tag, eventStream) if eventStream + # $log.info "8" @@istestvar = ENV["ISTEST"] if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") From 96e6eca6cd6d28d4258b6ad431046b0c60f385d7 Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 13 Mar 2019 18:16:26 -0700 Subject: [PATCH 08/38] changes --- .../code/plugin/CAdvisorMetricsAPIClient.rb | 21 ++++++++++++++++--- source/code/plugin/in_win_cadvisor_perf.rb | 13 +++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 3f4f66a00..810935f01 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -239,7 +239,12 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, # @Log.info "metricTime :#{metricTime}" return nil else - metricRateValue = ((metricValue - @@winContainerCpuUsageNanoSecondsLast[containerId]) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time) + timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time + if timeDifference != 0 + metricRateValue = ((metricValue - @@winContainerCpuUsageNanoSecondsLast[containerId]) * 1.0) / timeDifference + else + metricRateValue = 0 + end # @Log.info "In condition 2" # @Log.info "metricValue :#{metricValue}" # @Log.info "@@winContainerCpuUsageNanoSecondsLast[#{containerId}]: #{@@winContainerCpuUsageNanoSecondsLast[containerId]}" @@ -438,7 +443,12 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl @@nodeCpuUsageNanoSecondsTimeLast = metricTime return nil else - metricRateValue = ((metricValue - @@nodeCpuUsageNanoSecondsLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time) + timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time + if timeDifference != 0 + metricRateValue = ((metricValue - @@nodeCpuUsageNanoSecondsLast) * 1.0) / timeDifference + else + metricRateValue = 0 + end @@nodeCpuUsageNanoSecondsLast = metricValue @@nodeCpuUsageNanoSecondsTimeLast = metricTime metricValue = metricRateValue @@ -455,7 +465,12 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl # @Log.info "metricTime :#{metricTime}" return nil else - metricRateValue = ((metricValue - @@winNodeCpuUsageNanoSecondsLast[hostName]) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[hostName]).to_time) + timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[hostName]).to_time + if timeDifference != 0 + metricRateValue = ((metricValue - @@winNodeCpuUsageNanoSecondsLast[hostName]) * 1.0) / timeDifference + else + metricRateValue = 0 + end # @Log.info "In condition 2" # @Log.info "metricValue :#{metricValue}" # @Log.info "@@winNodeCpuUsageNanoSecondsLast[#{hostName}]: #{@@winNodeCpuUsageNanoSecondsLast[hostName]}" diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 7cd34eb16..aba04841a 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -5,6 +5,8 @@ module Fluent class Win_CAdvisor_Perf_Input < Input Plugin.register_input("wincadvisorperf", self) + @@winNodes = [] + def initialize super require "yaml" @@ -29,6 +31,7 @@ def start @condition = ConditionVariable.new @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) + @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i end end @@ -48,10 +51,14 @@ def enumerate() eventStream = MultiEventStream.new # $log.info "1" $log.info "in_win_cadvisor_perf : Getting windows nodes" - winNodes = KubernetesApiClient.getWindowsNodes() + timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + @@winNodes = KubernetesApiClient.getWindowsNodes() + $log.info "in_win_cadvisor_perf : Successuly got windows nodes" + end # $log.info "2" - $log.info "in_win_cadvisor_perf : Successuly got windows nodes" - winNodes.each do |winNode| + @@winNodes.each do |winNode| # $log.info "3" metricData = CAdvisorMetricsAPIClient.getMetrics(winNode) # $log.info "4" From 542d0a9ff10573a3aa328941463cb9c1e859d8c7 Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 13 Mar 2019 21:16:55 -0700 Subject: [PATCH 09/38] changes --- source/code/plugin/CAdvisorMetricsAPIClient.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 810935f01..9e27b8cb8 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -96,6 +96,7 @@ def getMetrics(winNode = nil) if operatingSystem == "Linux" metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", "cpuUsageNanoCores")) metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) elsif operatingSystem == "Windows" containerCpuUsageNanoSecondsRate = getContainerCpuMetricItemRate(metricInfo, hostName, "usageCoreNanoSeconds", "cpuUsageNanoCores") if containerCpuUsageNanoSecondsRate && !containerCpuUsageNanoSecondsRate.empty? && !containerCpuUsageNanoSecondsRate.nil? @@ -108,7 +109,7 @@ def getMetrics(winNode = nil) metricDataItems.push(cpuUsageNanoSecondsRate) end metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) + # metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch")) From 0d4601a81c8715c70b70190888f37deb06b698ce Mon Sep 17 00:00:00 2001 From: rashmy Date: Thu, 14 Mar 2019 16:00:25 -0700 Subject: [PATCH 10/38] changes --- .../code/plugin/CAdvisorMetricsAPIClient.rb | 61 +--- source/code/plugin/in_containerinventory.rb | 179 +++++----- source/code/plugin/in_kube_nodes.rb | 318 +++++++++--------- 3 files changed, 257 insertions(+), 301 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 9e27b8cb8..50d89298a 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -27,6 +27,7 @@ class CAdvisorMetricsAPIClient @@winContainerCpuUsageNanoSecondsTimeLast = {} @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + @@telemetryTimeTracker = DateTime.now.to_time.to_i def initialize end @@ -202,6 +203,7 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met # usageNanoCores doesnt exist for windows nodes. Hence need to compute this from usageCoreNanoSeconds def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) metricItems = [] + containerCount = 0 clusterId = KubernetesApiClient.getClusterId timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 @@ -215,6 +217,7 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, if (!pod["containers"].nil?) pod["containers"].each do |container| #cpu metric + containerCount += 1 containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] metricTime = container["cpu"]["time"] @@ -235,9 +238,6 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, if @@winContainerCpuUsageNanoSecondsLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsTimeLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsLast[containerId] > metricValue #when kubelet is restarted the last condition will be true @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime - # @Log.info "In condition 1" - # @Log.info "metricValue :#{metricValue}" - # @Log.info "metricTime :#{metricTime}" return nil else timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time @@ -246,47 +246,28 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, else metricRateValue = 0 end - # @Log.info "In condition 2" - # @Log.info "metricValue :#{metricValue}" - # @Log.info "@@winContainerCpuUsageNanoSecondsLast[#{containerId}]: #{@@winContainerCpuUsageNanoSecondsLast[containerId]}" - # @Log.info "metricTime :#{metricTime}" - # @Log.info "DateTime.parse(metricTime).to_time: #{DateTime.parse(metricTime).to_time}" - # @Log.info "@@winContainerCpuUsageNanoSecondsTimeLast[#{containerId}]: #{@@winContainerCpuUsageNanoSecondsTimeLast[containerId]}" - # @Log.info "DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[#{containerId}].to_time: #{DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time}" - # @Log.info "metricRateValue: #{metricRateValue}" @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime metricValue = metricRateValue end metricCollections["Value"] = metricValue - metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) - # @Log.info "Container: countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" - @Log.info "#{metricItem}" - #Telemetry about agent performance - begin - # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers - # cadvisor does not have pod/container metadata. so would need more work to cache as pv & use - if (podName.downcase.start_with?("omsagent-") && podNamespace.eql?("kube-system") && containerName.downcase.start_with?("omsagent") && metricNametoReturn.eql?("cpuUsageNanoCores")) - if (timeDifferenceInMinutes >= 10) - telemetryProps = {} - telemetryProps["PodName"] = podName - telemetryProps["ContainerName"] = containerName - ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) - end - end - rescue => errorStr - $log.warn("Exception while generating Telemetry from getcontainerCpuMetricItems failed: #{errorStr} for metric #{cpuMetricNameToCollect}") - end end end end - # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) - if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("cpuUsageNanoCores")) - @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i + #Sending ContainerInventoryTelemetry from replicaset for telemetry purposes + timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + @@telemetryTimeTracker = DateTime.now.to_time.to_i + telemetryProperties = {} + telemetryProperties["Computer"] = hostname + telemetryProperties["ContainerCount"] = containerCount + # Hardcoding the event to ContainerInventory hearbeat event since the telemetry is pivoted off of this event. + ApplicationInsightsUtility.sendTelemetry("ContainerInventory", telemetryProperties) end rescue => error @Log.warn("getcontainerCpuMetricItemRate failed: #{error} for metric #{cpuMetricNameToCollect}") @@ -388,7 +369,6 @@ def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) - # @Log.info "Node - countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" end rescue => error @Log.warn("getNodeMetricItem failed: #{error} for metric #{metricNameToCollect}") @@ -459,11 +439,6 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl if @@winNodeCpuUsageNanoSecondsLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsTimeLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsLast[hostName] > metricValue #when kubelet is restarted the last condition will be true @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime - # @Log.info "@@winNodeCpuUsageNanoSecondsLast[#{hostName}]: #{@@winNodeCpuUsageNanoSecondsLast[hostName]}" - # @Log.info "@@winNodeCpuUsageNanoSecondsLast[#{hostName}]: #{@@winNodeCpuUsageNanoSecondsLast[hostName]}" - # @Log.info "In condition 1" - # @Log.info "metricValue :#{metricValue}" - # @Log.info "metricTime :#{metricTime}" return nil else timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[hostName]).to_time @@ -472,14 +447,6 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl else metricRateValue = 0 end - # @Log.info "In condition 2" - # @Log.info "metricValue :#{metricValue}" - # @Log.info "@@winNodeCpuUsageNanoSecondsLast[#{hostName}]: #{@@winNodeCpuUsageNanoSecondsLast[hostName]}" - # @Log.info "metricTime :#{metricTime}" - # @Log.info "DateTime.parse(metricTime).to_time: #{DateTime.parse(metricTime).to_time}" - # @Log.info "@@winNodeCpuUsageNanoSecondsTimeLast[#{hostName}]: #{@@winNodeCpuUsageNanoSecondsTimeLast[hostName]}" - # @Log.info "DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[#{hostName}].to_time: #{DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[hostName]).to_time}" - # @Log.info "metricRateValue: #{metricRateValue}" @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime metricValue = metricRateValue @@ -500,7 +467,6 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl metricCollections["Value"] = metricValue metricProps["Collections"].push(metricCollections) - # @Log.info "Node - countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" metricItem["DataItems"].push(metricProps) @Log.info "#{metricItem}" end @@ -540,7 +506,6 @@ def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn) metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) - # @Log.info "Node - countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" rescue => error @Log.warn("getNodeLastRebootTimeMetric failed: #{error} ") @Log.warn metricJSON diff --git a/source/code/plugin/in_containerinventory.rb b/source/code/plugin/in_containerinventory.rb index f501421a2..4d83278a9 100644 --- a/source/code/plugin/in_containerinventory.rb +++ b/source/code/plugin/in_containerinventory.rb @@ -2,29 +2,28 @@ # frozen_string_literal: true module Fluent - class Container_Inventory_Input < Input - Plugin.register_input('containerinventory', self) + Plugin.register_input("containerinventory", self) - @@PluginName = 'ContainerInventory' - @@RunningState = 'Running' - @@FailedState = 'Failed' - @@StoppedState = 'Stopped' - @@PausedState = 'Paused' + @@PluginName = "ContainerInventory" + @@RunningState = "Running" + @@FailedState = "Failed" + @@StoppedState = "Stopped" + @@PausedState = "Paused" def initialize super - require 'json' - require_relative 'DockerApiClient' - require_relative 'ContainerInventoryState' - require_relative 'ApplicationInsightsUtility' - require_relative 'omslog' + require "json" + require_relative "DockerApiClient" + require_relative "ContainerInventoryState" + require_relative "ApplicationInsightsUtility" + require_relative "omslog" end - config_param :run_interval, :time, :default => '1m' + config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.containerinsights.containerinventory" - - def configure (conf) + + def configure(conf) super end @@ -50,16 +49,16 @@ def shutdown def obtainContainerConfig(instance, container) begin - configValue = container['Config'] + configValue = container["Config"] if !configValue.nil? - instance['ContainerHostname'] = configValue['Hostname'] + instance["ContainerHostname"] = configValue["Hostname"] - envValue = configValue['Env'] + envValue = configValue["Env"] envValueString = (envValue.nil?) ? "" : envValue.to_s # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString) envValueString = ["AZMON_COLLECT_ENV=FALSE"] - $log.warn("Environment Variable collection for container: #{container['Id']} skipped because AZMON_COLLECT_ENV is set to false") + $log.warn("Environment Variable collection for container: #{container["Id"]} skipped because AZMON_COLLECT_ENV is set to false") end # Restricting the ENV string value to 200kb since the size of this string can go very high if envValueString.length > 200000 @@ -68,88 +67,88 @@ def obtainContainerConfig(instance, container) if !lastIndex.nil? envValueStringTruncated = envValueStringTruncated.slice(0..lastIndex) + "]" end - instance['EnvironmentVar'] = envValueStringTruncated + instance["EnvironmentVar"] = envValueStringTruncated else - instance['EnvironmentVar'] = envValueString + instance["EnvironmentVar"] = envValueString end - cmdValue = configValue['Cmd'] + cmdValue = configValue["Cmd"] cmdValueString = (cmdValue.nil?) ? "" : cmdValue.to_s - instance['Command'] = cmdValueString + instance["Command"] = cmdValueString - instance['ComposeGroup'] = "" - labelsValue = configValue['Labels'] + instance["ComposeGroup"] = "" + labelsValue = configValue["Labels"] if !labelsValue.nil? && !labelsValue.empty? - instance['ComposeGroup'] = labelsValue['com.docker.compose.project'] + instance["ComposeGroup"] = labelsValue["com.docker.compose.project"] end else - $log.warn("Attempt in ObtainContainerConfig to get container: #{container['Id']} config information returned null") - end - rescue => errorStr - $log.warn("Exception in obtainContainerConfig: #{errorStr}") + $log.warn("Attempt in ObtainContainerConfig to get container: #{container["Id"]} config information returned null") end + rescue => errorStr + $log.warn("Exception in obtainContainerConfig: #{errorStr}") + end end def obtainContainerState(instance, container) begin - stateValue = container['State'] + stateValue = container["State"] if !stateValue.nil? - exitCodeValue = stateValue['ExitCode'] + exitCodeValue = stateValue["ExitCode"] # Exit codes less than 0 are not supported by the engine if exitCodeValue < 0 - exitCodeValue = 128 - $log.info("obtainContainerState::Container: #{container['Id']} returned negative exit code") + exitCodeValue = 128 + $log.info("obtainContainerState::Container: #{container["Id"]} returned negative exit code") end - instance['ExitCode'] = exitCodeValue + instance["ExitCode"] = exitCodeValue if exitCodeValue > 0 - instance['State'] = @@FailedState + instance["State"] = @@FailedState else # Set the Container status : Running/Paused/Stopped - runningValue = stateValue['Running'] + runningValue = stateValue["Running"] if runningValue - pausedValue = stateValue['Paused'] + pausedValue = stateValue["Paused"] # Checking for paused within running is true state because docker returns true for both Running and Paused fields when the container is paused if pausedValue - instance['State'] = @@PausedState + instance["State"] = @@PausedState else - instance['State'] = @@RunningState + instance["State"] = @@RunningState end else - instance['State'] = @@StoppedState + instance["State"] = @@StoppedState end end - instance['StartedTime'] = stateValue['StartedAt'] - instance['FinishedTime'] = stateValue['FinishedAt'] + instance["StartedTime"] = stateValue["StartedAt"] + instance["FinishedTime"] = stateValue["FinishedAt"] else - $log.info("Attempt in ObtainContainerState to get container: #{container['Id']} state information returned null") + $log.info("Attempt in ObtainContainerState to get container: #{container["Id"]} state information returned null") end - rescue => errorStr - $log.warn("Exception in obtainContainerState: #{errorStr}") + rescue => errorStr + $log.warn("Exception in obtainContainerState: #{errorStr}") end end def obtainContainerHostConfig(instance, container) begin - hostConfig = container['HostConfig'] + hostConfig = container["HostConfig"] if !hostConfig.nil? - links = hostConfig['Links'] - instance['Links'] = "" + links = hostConfig["Links"] + instance["Links"] = "" if !links.nil? linksString = links.to_s - instance['Links'] = (linksString == "null")? "" : linksString + instance["Links"] = (linksString == "null") ? "" : linksString end - portBindings = hostConfig['PortBindings'] - instance['Ports'] = "" + portBindings = hostConfig["PortBindings"] + instance["Ports"] = "" if !portBindings.nil? portBindingsString = portBindings.to_s - instance['Ports'] = (portBindingsString == "null")? "" : portBindingsString + instance["Ports"] = (portBindingsString == "null") ? "" : portBindingsString end else - $log.info("Attempt in ObtainContainerHostConfig to get container: #{container['Id']} host config information returned null") - end - rescue => errorStr - $log.warn("Exception in obtainContainerHostConfig: #{errorStr}") + $log.info("Attempt in ObtainContainerHostConfig to get container: #{container["Id"]} host config information returned null") end + rescue => errorStr + $log.warn("Exception in obtainContainerHostConfig: #{errorStr}") + end end def inspectContainer(id, nameMap) @@ -157,29 +156,29 @@ def inspectContainer(id, nameMap) begin container = DockerApiClient.dockerInspectContainer(id) if !container.nil? && !container.empty? - containerInstance['InstanceID'] = container['Id'] - containerInstance['CreatedTime'] = container['Created'] - containerName = container['Name'] + containerInstance["InstanceID"] = container["Id"] + containerInstance["CreatedTime"] = container["Created"] + containerName = container["Name"] if !containerName.nil? && !containerName.empty? # Remove the leading / from the name if it exists (this is an API issue) - containerInstance['ElementName'] = (containerName[0] == '/') ? containerName[1..-1] : containerName + containerInstance["ElementName"] = (containerName[0] == "/") ? containerName[1..-1] : containerName end - imageValue = container['Image'] + imageValue = container["Image"] if !imageValue.nil? && !imageValue.empty? - containerInstance['ImageId'] = imageValue + containerInstance["ImageId"] = imageValue repoImageTagArray = nameMap[imageValue] if nameMap.has_key? imageValue - containerInstance['Repository'] = repoImageTagArray[0] - containerInstance['Image'] = repoImageTagArray[1] - containerInstance['ImageTag'] = repoImageTagArray[2] + containerInstance["Repository"] = repoImageTagArray[0] + containerInstance["Image"] = repoImageTagArray[1] + containerInstance["ImageTag"] = repoImageTagArray[2] end end - obtainContainerConfig(containerInstance, container); - obtainContainerState(containerInstance, container); - obtainContainerHostConfig(containerInstance, container); + obtainContainerConfig(containerInstance, container) + obtainContainerState(containerInstance, container) + obtainContainerHostConfig(containerInstance, container) end rescue => errorStr - $log.warn("Exception in inspectContainer: #{errorStr} for container: #{id}") + $log.warn("Exception in inspectContainer: #{errorStr} for container: #{id}") end return containerInstance end @@ -199,8 +198,8 @@ def enumerate containerIds.each do |containerId| inspectedContainer = {} inspectedContainer = inspectContainer(containerId, nameMap) - inspectedContainer['Computer'] = hostname - inspectedContainer['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated + inspectedContainer["Computer"] = hostname + inspectedContainer["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated containerInventory.push inspectedContainer ContainerInventoryState.writeContainerState(inspectedContainer) end @@ -210,8 +209,8 @@ def enumerate deletedContainers.each do |deletedContainer| container = ContainerInventoryState.readContainerState(deletedContainer) if !container.nil? - container.each{|k,v| container[k]=v} - container['State'] = "Deleted" + container.each { |k, v| container[k] = v } + container["State"] = "Deleted" containerInventory.push container end end @@ -219,28 +218,28 @@ def enumerate containerInventory.each do |record| wrapper = { - "DataType"=>"CONTAINER_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] + "DataType" => "CONTAINER_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], } eventStream.add(emitTime, wrapper) if wrapper end router.emit_stream(@tag, eventStream) if eventStream - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("containerInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - if (timeDifferenceInMinutes >= 5) - @@telemetryTimeTracker = DateTime.now.to_time.to_i - telemetryProperties = {} - telemetryProperties['Computer'] = hostname - telemetryProperties['ContainerCount'] = containerInventory.length - ApplicationInsightsUtility.sendTelemetry(@@PluginName, telemetryProperties) - end $log.info("in_container_inventory::enumerate : Processing complete - emitted stream @ #{Time.now.utc.iso8601}") end + timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + @@telemetryTimeTracker = DateTime.now.to_time.to_i + telemetryProperties = {} + telemetryProperties["Computer"] = hostname + telemetryProperties["ContainerCount"] = containerInventory.length + ApplicationInsightsUtility.sendTelemetry(@@PluginName, telemetryProperties) + end rescue => errorStr $log.warn("Exception in enumerate container inventory: #{errorStr}") end @@ -265,7 +264,5 @@ def run_periodic end @mutex.unlock end - end # Container_Inventory_Input - -end # module \ No newline at end of file +end # module diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index ba1dacbe0..556d51821 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -2,181 +2,175 @@ # frozen_string_literal: true module Fluent + class Kube_nodeInventory_Input < Input + Plugin.register_input("kubenodeinventory", self) - class Kube_nodeInventory_Input < Input - Plugin.register_input('kubenodeinventory', self) - - @@ContainerNodeInventoryTag = 'oms.containerinsights.ContainerNodeInventory' - @@MDMKubeNodeInventoryTag = 'mdm.kubenodeinventory' + @@ContainerNodeInventoryTag = "oms.containerinsights.ContainerNodeInventory" + @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" - def initialize - super - require 'yaml' - require 'json' - - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' - require_relative 'oms_common' - require_relative 'omslog' + def initialize + super + require "yaml" + require "json" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + end + + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.containerinsights.KubeNodeInventory" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i end - - config_param :run_interval, :time, :default => '1m' - config_param :tag, :string, :default => "oms.containerinsights.KubeNodeInventory" - - def configure (conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) - @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i - end - end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join - end - end - - def enumerate - currentTime = Time.now - emitTime = currentTime.to_f - batchTime = currentTime.utc.iso8601 - telemetrySent = false - $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo('nodes').body) - $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - begin - if(!nodeInventory.empty?) - eventStream = MultiEventStream.new - containerNodeInventoryEventStream = MultiEventStream.new - #get node inventory - nodeInventory['items'].each do |items| - record = {} - # Sending records for ContainerNodeInventory - containerNodeInventoryRecord = {} - containerNodeInventoryRecord['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - containerNodeInventoryRecord['Computer'] = items['metadata']['name'] + end - record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - record['Computer'] = items['metadata']['name'] - record['ClusterName'] = KubernetesApiClient.getClusterName - record['ClusterId'] = KubernetesApiClient.getClusterId - record['CreationTimeStamp'] = items['metadata']['creationTimestamp'] - record['Labels'] = [items['metadata']['labels']] - record['Status'] = "" + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end - # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. - # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we - # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" - # implying that the node is ready for hosting pods, however its out of disk. - - if items['status'].key?("conditions") && !items['status']['conditions'].empty? - allNodeConditions="" - items['status']['conditions'].each do |condition| - if condition['status'] == "True" - if !allNodeConditions.empty? - allNodeConditions = allNodeConditions + "," + condition['type'] - else - allNodeConditions = condition['type'] - end - end - #collect last transition to/from ready (no matter ready is true/false) - if condition['type'] == "Ready" && !condition['lastTransitionTime'].nil? - record['LastTransitionTimeReady'] = condition['lastTransitionTime'] - end - end - if !allNodeConditions.empty? - record['Status'] = allNodeConditions - end + def enumerate + currentTime = Time.now + emitTime = currentTime.to_f + batchTime = currentTime.utc.iso8601 + telemetrySent = false + $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + begin + if (!nodeInventory.empty?) + eventStream = MultiEventStream.new + containerNodeInventoryEventStream = MultiEventStream.new + #get node inventory + nodeInventory["items"].each do |items| + record = {} + # Sending records for ContainerNodeInventory + containerNodeInventoryRecord = {} + containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] - end + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Computer"] = items["metadata"]["name"] + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] + record["Labels"] = [items["metadata"]["labels"]] + record["Status"] = "" - nodeInfo = items['status']['nodeInfo'] - record['KubeletVersion'] = nodeInfo['kubeletVersion'] - record['KubeProxyVersion'] = nodeInfo['kubeProxyVersion'] - containerNodeInventoryRecord['OperatingSystem'] = nodeInfo['osImage'] - dockerVersion = nodeInfo['containerRuntimeVersion'] - dockerVersion.slice! "docker://" - containerNodeInventoryRecord['DockerVersion'] = dockerVersion - # ContainerNodeInventory data for docker version and operating system. - containerNodeInventoryWrapper = { - "DataType"=>"CONTAINER_NODE_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[containerNodeInventoryRecord.each{|k,v| containerNodeInventoryRecord[k]=v}] - } - containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. + # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we + # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" + # implying that the node is ready for hosting pods, however its out of disk. - wrapper = { - "DataType"=>"KUBE_NODE_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] - } - eventStream.add(emitTime, wrapper) if wrapper - # Adding telemetry to send node telemetry every 5 minutes - timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - if (timeDifferenceInMinutes >= 5) - properties = {} - properties["Computer"] = record["Computer"] - properties["KubeletVersion"] = record["KubeletVersion"] - capacityInfo = items['status']['capacity'] - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"] , properties) - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"] , properties) - telemetrySent = true - end - end - router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream - if telemetrySent == true - @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i + if items["status"].key?("conditions") && !items["status"]["conditions"].empty? + allNodeConditions = "" + items["status"]["conditions"].each do |condition| + if condition["status"] == "True" + if !allNodeConditions.empty? + allNodeConditions = allNodeConditions + "," + condition["type"] + else + allNodeConditions = condition["type"] + end end - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) - $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + #collect last transition to/from ready (no matter ready is true/false) + if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? + record["LastTransitionTimeReady"] = condition["lastTransitionTime"] end - end - rescue => errorStr - $log.warn "Failed to retrieve node inventory: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - end - - def run_periodic - @mutex.lock - done = @finished - until done - @condition.wait(@mutex, @run_interval) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kube_nodes::run_periodic @ #{Time.now.utc.iso8601}") - enumerate - rescue => errorStr - $log.warn "in_kube_nodes::run_periodic: enumerate Failed to retrieve node inventory: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + if !allNodeConditions.empty? + record["Status"] = allNodeConditions + end end + + nodeInfo = items["status"]["nodeInfo"] + record["KubeletVersion"] = nodeInfo["kubeletVersion"] + record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] + containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] + dockerVersion = nodeInfo["containerRuntimeVersion"] + dockerVersion.slice! "docker://" + containerNodeInventoryRecord["DockerVersion"] = dockerVersion + # ContainerNodeInventory data for docker version and operating system. + containerNodeInventoryWrapper = { + "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], + } + containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + + wrapper = { + "DataType" => "KUBE_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + # Adding telemetry to send node telemetry every 5 minutes + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + properties = {} + properties["Computer"] = record["Computer"] + properties["KubeletVersion"] = record["KubeletVersion"] + properties["OperatingSystem"] = nodeInfo["operatingSystem"] + capacityInfo = items["status"]["capacity"] + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + telemetrySent = true + end + end + router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + if telemetrySent == true + @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i + end + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - @mutex.lock end + rescue => errorStr + $log.warn "Failed to retrieve node inventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) + done = @finished @mutex.unlock + if !done + begin + $log.info("in_kube_nodes::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_kube_nodes::run_periodic: enumerate Failed to retrieve node inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock end - - end # Kube_Node_Input - - end # module - - \ No newline at end of file + @mutex.unlock + end + end # Kube_Node_Input +end # module From fe3d8a07e65481b821e236c7b9b7d33a35ce08d0 Mon Sep 17 00:00:00 2001 From: rashmy Date: Thu, 14 Mar 2019 16:05:57 -0700 Subject: [PATCH 11/38] changes --- source/code/plugin/CAdvisorMetricsAPIClient.rb | 3 --- source/code/plugin/in_win_cadvisor_perf.rb | 12 +----------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 50d89298a..f42865ef8 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -170,7 +170,6 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) - # @Log.info "Container : countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" #Telemetry about agent performance begin # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers @@ -311,7 +310,6 @@ def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollec metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) @Log.info "#{metricItem}" - # @Log.info "Container - countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" #Telemetry about agent performance begin # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers @@ -545,7 +543,6 @@ def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn) metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) - # @Log.info "Container - countername: #{metricCollections["CounterName"]}, countervalue: #{metricCollections["Value"]}" end end end diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index aba04841a..48a127dde 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -49,33 +49,23 @@ def enumerate() time = Time.now.to_f begin eventStream = MultiEventStream.new - # $log.info "1" $log.info "in_win_cadvisor_perf : Getting windows nodes" timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 if (timeDifferenceInMinutes >= 5) @@winNodes = KubernetesApiClient.getWindowsNodes() - $log.info "in_win_cadvisor_perf : Successuly got windows nodes" + $log.info "in_win_cadvisor_perf : Successuly got windows nodes after 5 minute interval" end - # $log.info "2" @@winNodes.each do |winNode| - # $log.info "3" metricData = CAdvisorMetricsAPIClient.getMetrics(winNode) - # $log.info "4" - # $log.info "windows node metric data: #{metricData}" metricData.each do |record| - # $log.info "5" if !record.empty? - # $log.info "6" record["DataType"] = "LINUX_PERF_BLOB" record["IPName"] = "LogManagement" eventStream.add(time, record) if record - $log.info "windows node record: #{record}" - # $log.info "7" end end router.emit_stream(@tag, eventStream) if eventStream - # $log.info "8" @@istestvar = ENV["ISTEST"] if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") From a6e29b06f05d3061b4f641c889a8b9b3a0b1ee24 Mon Sep 17 00:00:00 2001 From: rashmy Date: Thu, 14 Mar 2019 16:12:38 -0700 Subject: [PATCH 12/38] changes to include in_win_cadvisor_perf.rb file --- installer/datafiles/base_container.data | 1 + 1 file changed, 1 insertion(+) diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index c263aa505..9c4d563f8 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -34,6 +34,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/CAdvisorMetricsAPIClient.rb; source/code/plugin/CAdvisorMetricsAPIClient.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_perf.rb; source/code/plugin/in_kube_perf.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_cadvisor_perf.rb; source/code/plugin/in_cadvisor_perf.rb; 644; root; root +/opt/microsoft/omsagent/plugin/in_win_cadvisor_perf.rb; source/code/plugin/in_win_cadvisor_perf.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_services.rb; source/code/plugin/in_kube_services.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root /opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/code/plugin/filter_inventory2mdm.rb; 644; root; root From 99a241b5227b5d2ec12c391ae7325ff6f66bf255 Mon Sep 17 00:00:00 2001 From: rashmy Date: Thu, 14 Mar 2019 21:28:39 -0700 Subject: [PATCH 13/38] send containerinventoryheartbeatevent --- source/code/plugin/CAdvisorMetricsAPIClient.rb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index f42865ef8..29103eb9f 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -202,7 +202,6 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met # usageNanoCores doesnt exist for windows nodes. Hence need to compute this from usageCoreNanoSeconds def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) metricItems = [] - containerCount = 0 clusterId = KubernetesApiClient.getClusterId timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 @@ -216,7 +215,6 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, if (!pod["containers"].nil?) pod["containers"].each do |container| #cpu metric - containerCount += 1 containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] metricTime = container["cpu"]["time"] @@ -264,9 +262,8 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, @@telemetryTimeTracker = DateTime.now.to_time.to_i telemetryProperties = {} telemetryProperties["Computer"] = hostname - telemetryProperties["ContainerCount"] = containerCount # Hardcoding the event to ContainerInventory hearbeat event since the telemetry is pivoted off of this event. - ApplicationInsightsUtility.sendTelemetry("ContainerInventory", telemetryProperties) + ApplicationInsightsUtility.sendCustomEvent("ContainerInventoryHeartBeatEvent", telemetryProperties) end rescue => error @Log.warn("getcontainerCpuMetricItemRate failed: #{error} for metric #{cpuMetricNameToCollect}") From f3f6fb4564cd240c36bfb0a51074979d4ca3a70d Mon Sep 17 00:00:00 2001 From: rashmy Date: Fri, 15 Mar 2019 15:20:55 -0700 Subject: [PATCH 14/38] changes --- .../code/plugin/ApplicationInsightsUtility.rb | 372 +++++++++--------- .../code/plugin/CAdvisorMetricsAPIClient.rb | 78 +++- source/code/plugin/in_kube_podinventory.rb | 261 ++++++------ 3 files changed, 377 insertions(+), 334 deletions(-) diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 5c5e92a6c..d0422487b 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -2,209 +2,215 @@ # frozen_string_literal: true class ApplicationInsightsUtility - require_relative 'lib/application_insights' - require_relative 'omslog' - require_relative 'DockerApiClient' - require_relative 'oms_common' - require 'json' - require 'base64' + require_relative "lib/application_insights" + require_relative "omslog" + require_relative "DockerApiClient" + require_relative "oms_common" + require "json" + require "base64" - @@HeartBeat = 'HeartBeatEvent' - @@Exception = 'ExceptionEvent' - @@AcsClusterType = 'ACS' - @@AksClusterType = 'AKS' - @OmsAdminFilePath = '/etc/opt/microsoft/omsagent/conf/omsadmin.conf' - @@EnvAcsResourceName = 'ACS_RESOURCE_NAME' - @@EnvAksRegion = 'AKS_REGION' - @@EnvAgentVersion = 'AGENT_VERSION' - @@EnvApplicationInsightsKey = 'APPLICATIONINSIGHTS_AUTH' - @@EnvControllerType = 'CONTROLLER_TYPE' + @@HeartBeat = "HeartBeatEvent" + @@Exception = "ExceptionEvent" + @@AcsClusterType = "ACS" + @@AksClusterType = "AKS" + @OmsAdminFilePath = "/etc/opt/microsoft/omsagent/conf/omsadmin.conf" + @@EnvAcsResourceName = "ACS_RESOURCE_NAME" + @@EnvAksRegion = "AKS_REGION" + @@EnvAgentVersion = "AGENT_VERSION" + @@EnvApplicationInsightsKey = "APPLICATIONINSIGHTS_AUTH" + @@EnvControllerType = "CONTROLLER_TYPE" - @@CustomProperties = {} - @@Tc = nil - @@hostName = (OMS::Common.get_hostname) + @@CustomProperties = {} + @@Tc = nil + @@hostName = (OMS::Common.get_hostname) - def initialize - end + def initialize + end - class << self - #Set default properties for telemetry event - def initializeUtility() - begin - resourceInfo = ENV['AKS_RESOURCE_ID'] - if resourceInfo.nil? || resourceInfo.empty? - @@CustomProperties["ACSResourceName"] = ENV[@@EnvAcsResourceName] - @@CustomProperties["ClusterType"] = @@AcsClusterType - @@CustomProperties["SubscriptionID"] = "" - @@CustomProperties["ResourceGroupName"] = "" - @@CustomProperties["ClusterName"] = "" - @@CustomProperties["Region"] = "" - else - @@CustomProperties["AKS_RESOURCE_ID"] = resourceInfo - begin - splitStrings = resourceInfo.split('/') - subscriptionId = splitStrings[2] - resourceGroupName = splitStrings[4] - clusterName = splitStrings[8] - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: parsing AKS resourceId: #{resourceInfo}, error: #{errorStr}") - end - @@CustomProperties["ClusterType"] = @@AksClusterType - @@CustomProperties["SubscriptionID"] = subscriptionId - @@CustomProperties["ResourceGroupName"] = resourceGroupName - @@CustomProperties["ClusterName"] = clusterName - @@CustomProperties["Region"] = ENV[@@EnvAksRegion] - end + class << self + #Set default properties for telemetry event + def initializeUtility() + begin + resourceInfo = ENV["AKS_RESOURCE_ID"] + if resourceInfo.nil? || resourceInfo.empty? + @@CustomProperties["ACSResourceName"] = ENV[@@EnvAcsResourceName] + @@CustomProperties["ClusterType"] = @@AcsClusterType + @@CustomProperties["SubscriptionID"] = "" + @@CustomProperties["ResourceGroupName"] = "" + @@CustomProperties["ClusterName"] = "" + @@CustomProperties["Region"] = "" + else + @@CustomProperties["AKS_RESOURCE_ID"] = resourceInfo + begin + splitStrings = resourceInfo.split("/") + subscriptionId = splitStrings[2] + resourceGroupName = splitStrings[4] + clusterName = splitStrings[8] + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: parsing AKS resourceId: #{resourceInfo}, error: #{errorStr}") + end + @@CustomProperties["ClusterType"] = @@AksClusterType + @@CustomProperties["SubscriptionID"] = subscriptionId + @@CustomProperties["ResourceGroupName"] = resourceGroupName + @@CustomProperties["ClusterName"] = clusterName + @@CustomProperties["Region"] = ENV[@@EnvAksRegion] + end - getDockerInfo() - @@CustomProperties['WorkspaceID'] = getWorkspaceId - @@CustomProperties['AgentVersion'] = ENV[@@EnvAgentVersion] - @@CustomProperties['ControllerType'] = ENV[@@EnvControllerType] - encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey] + getDockerInfo() + @@CustomProperties["WorkspaceID"] = getWorkspaceId + @@CustomProperties["AgentVersion"] = ENV[@@EnvAgentVersion] + @@CustomProperties["ControllerType"] = ENV[@@EnvControllerType] + encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey] - #Check if telemetry is turned off - telemetryOffSwitch = ENV['DISABLE_TELEMETRY'] - if telemetryOffSwitch && !telemetryOffSwitch.nil? && !telemetryOffSwitch.empty? && telemetryOffSwitch.downcase == "true".downcase - $log.warn("AppInsightsUtility: Telemetry is disabled") - @@Tc = ApplicationInsights::TelemetryClient.new - elsif !encodedAppInsightsKey.nil? - decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) - @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey - - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: initilizeUtility - error: #{errorStr}") - end + #Check if telemetry is turned off + telemetryOffSwitch = ENV["DISABLE_TELEMETRY"] + if telemetryOffSwitch && !telemetryOffSwitch.nil? && !telemetryOffSwitch.empty? && telemetryOffSwitch.downcase == "true".downcase + $log.warn("AppInsightsUtility: Telemetry is disabled") + @@Tc = ApplicationInsights::TelemetryClient.new + elsif !encodedAppInsightsKey.nil? + decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) + @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: initilizeUtility - error: #{errorStr}") + end + end - def getDockerInfo() - dockerInfo = DockerApiClient.dockerInfo - if (!dockerInfo.nil? && !dockerInfo.empty?) - @@CustomProperties['DockerVersion'] = dockerInfo['Version'] - @@CustomProperties['DockerApiVersion'] = dockerInfo['ApiVersion'] - end - end + def getDockerInfo() + dockerInfo = DockerApiClient.dockerInfo + if (!dockerInfo.nil? && !dockerInfo.empty?) + @@CustomProperties["DockerVersion"] = dockerInfo["Version"] + @@CustomProperties["DockerApiVersion"] = dockerInfo["ApiVersion"] + end + end - def sendHeartBeatEvent(pluginName) - begin - eventName = pluginName + @@HeartBeat - if !(@@Tc.nil?) - @@Tc.track_event eventName , :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Heartbeat Telemetry sent successfully") - end - rescue =>errorStr - $log.warn("Exception in AppInsightsUtility: sendHeartBeatEvent - error: #{errorStr}") - end + def sendHeartBeatEvent(pluginName) + begin + eventName = pluginName + @@HeartBeat + if !(@@Tc.nil?) + @@Tc.track_event eventName, :properties => @@CustomProperties + @@Tc.flush + $log.info("AppInsights Heartbeat Telemetry sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendHeartBeatEvent - error: #{errorStr}") + end + end - def sendLastProcessedContainerInventoryCountMetric(pluginName, properties) - begin - if !(@@Tc.nil?) - @@Tc.track_metric 'LastProcessedContainerInventoryCount', properties['ContainerCount'], - :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, - :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Container Count Telemetry sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendCustomMetric - error: #{errorStr}") - end + def sendLastProcessedContainerInventoryCountMetric(pluginName, properties) + begin + if !(@@Tc.nil?) + @@Tc.track_metric "LastProcessedContainerInventoryCount", properties["ContainerCount"], + :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, + :properties => @@CustomProperties + @@Tc.flush + $log.info("AppInsights Container Count Telemetry sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendCustomMetric - error: #{errorStr}") + end + end - def sendCustomEvent(eventName, properties) - begin - if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility() - end - if !(@@Tc.nil?) - @@Tc.track_event eventName, :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Custom Event #{eventName} sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendCustomEvent - error: #{errorStr}") - end + def sendCustomEvent(eventName, properties) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility() + end + telemetryProps = {} + # add common dimensions + @@CustomProperties.each { |k, v| telemetryProps[k] = v } + # add passed-in dimensions if any + if (!properties.nil? && !properties.empty?) + properties.each { |k, v| telemetryProps[k] = v } + end + if !(@@Tc.nil?) + @@Tc.track_event eventName, :properties => telemetryProps + @@Tc.flush + $log.info("AppInsights Custom Event #{eventName} sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendCustomEvent - error: #{errorStr}") + end + end - def sendExceptionTelemetry(errorStr) - begin - if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility() - elsif @@CustomProperties['DockerVersion'].nil? - getDockerInfo() - end - if !(@@Tc.nil?) - @@Tc.track_exception errorStr , :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Exception Telemetry sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendExceptionTelemetry - error: #{errorStr}") - end + def sendExceptionTelemetry(errorStr) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility() + elsif @@CustomProperties["DockerVersion"].nil? + getDockerInfo() + end + if !(@@Tc.nil?) + @@Tc.track_exception errorStr, :properties => @@CustomProperties + @@Tc.flush + $log.info("AppInsights Exception Telemetry sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendExceptionTelemetry - error: #{errorStr}") + end + end - #Method to send heartbeat and container inventory count - def sendTelemetry(pluginName, properties) - begin - if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility() - elsif @@CustomProperties['DockerVersion'].nil? - getDockerInfo() - end - @@CustomProperties['Computer'] = properties['Computer'] - sendHeartBeatEvent(pluginName) - sendLastProcessedContainerInventoryCountMetric(pluginName, properties) - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendTelemetry - error: #{errorStr}") - end + #Method to send heartbeat and container inventory count + def sendTelemetry(pluginName, properties) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility() + elsif @@CustomProperties["DockerVersion"].nil? + getDockerInfo() end + @@CustomProperties["Computer"] = properties["Computer"] + sendHeartBeatEvent(pluginName) + sendLastProcessedContainerInventoryCountMetric(pluginName, properties) + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendTelemetry - error: #{errorStr}") + end + end - #Method to send metric. It will merge passed-in properties with common custom properties - def sendMetricTelemetry(metricName, metricValue, properties) - begin - if (metricName.empty? || metricName.nil?) - $log.warn("SendMetricTelemetry: metricName is missing") - return - end - if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility() - elsif @@CustomProperties['DockerVersion'].nil? - getDockerInfo() - end - telemetryProps = {} - telemetryProps["Computer"] = @@hostName - # add common dimensions - @@CustomProperties.each{ |k,v| telemetryProps[k]=v} - # add passed-in dimensions if any - if (!properties.nil? && !properties.empty?) - properties.each{ |k,v| telemetryProps[k]=v} - end - if !(@@Tc.nil?) - @@Tc.track_metric metricName, metricValue, - :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, - :properties => telemetryProps - @@Tc.flush - $log.info("AppInsights metric Telemetry #{metricName} sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendMetricTelemetry - error: #{errorStr}") - end + #Method to send metric. It will merge passed-in properties with common custom properties + def sendMetricTelemetry(metricName, metricValue, properties) + begin + if (metricName.empty? || metricName.nil?) + $log.warn("SendMetricTelemetry: metricName is missing") + return end + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility() + elsif @@CustomProperties["DockerVersion"].nil? + getDockerInfo() + end + telemetryProps = {} + telemetryProps["Computer"] = @@hostName + # add common dimensions + @@CustomProperties.each { |k, v| telemetryProps[k] = v } + # add passed-in dimensions if any + if (!properties.nil? && !properties.empty?) + properties.each { |k, v| telemetryProps[k] = v } + end + if !(@@Tc.nil?) + @@Tc.track_metric metricName, metricValue, + :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, + :properties => telemetryProps + @@Tc.flush + $log.info("AppInsights metric Telemetry #{metricName} sent successfully") + end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendMetricTelemetry - error: #{errorStr}") + end + end - def getWorkspaceId() - begin - adminConf = {} - confFile = File.open(@OmsAdminFilePath, "r") - confFile.each_line do |line| - splitStrings = line.split('=') - adminConf[splitStrings[0]] = splitStrings[1] - end - workspaceId = adminConf['WORKSPACE_ID'] - return workspaceId - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: getWorkspaceId - error: #{errorStr}") - end + def getWorkspaceId() + begin + adminConf = {} + confFile = File.open(@OmsAdminFilePath, "r") + confFile.each_line do |line| + splitStrings = line.split("=") + adminConf[splitStrings[0]] = splitStrings[1] end + workspaceId = adminConf["WORKSPACE_ID"] + return workspaceId + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: getWorkspaceId - error: #{errorStr}") + end end -end \ No newline at end of file + end +end diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 29103eb9f..73a4f4cce 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -25,9 +25,12 @@ class CAdvisorMetricsAPIClient @@winNodeCpuUsageNanoSecondsTimeLast = {} @@winContainerCpuUsageNanoSecondsLast = {} @@winContainerCpuUsageNanoSecondsTimeLast = {} + @@winContainerPrevMetricRate = {} + @@linuxNodePrevMetricRate = nil + @@winNodePrevMetricRate = {} @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i - @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@nodeTelemetryTimeTracker = {} def initialize end @@ -202,6 +205,7 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met # usageNanoCores doesnt exist for windows nodes. Hence need to compute this from usageCoreNanoSeconds def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) metricItems = [] + # containerCount = 0 clusterId = KubernetesApiClient.getClusterId timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 @@ -215,6 +219,7 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, if (!pod["containers"].nil?) pod["containers"].each do |container| #cpu metric + # containerCount += 1 containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] metricTime = container["cpu"]["time"] @@ -235,35 +240,52 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, if @@winContainerCpuUsageNanoSecondsLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsTimeLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsLast[containerId] > metricValue #when kubelet is restarted the last condition will be true @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime - return nil + # return nil + next else timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time - if timeDifference != 0 - metricRateValue = ((metricValue - @@winContainerCpuUsageNanoSecondsLast[containerId]) * 1.0) / timeDifference + containerCpuUsageDifference = metricValue - @@winContainerCpuUsageNanoSecondsLast[containerId] + # containerCpuUsageDifference check is added to make sure we report non zero values when cadvisor returns same values for subsequent calls + if timeDifference != 0 && containerCpuUsageDifference != 0 + metricRateValue = (containerCpuUsageDifference * 1.0) / timeDifference else - metricRateValue = 0 + @Log.info "container - cpu usage difference / time difference is 0, hence using previous cached value" + if !@@winContainerPrevMetricRate[containerId].nil? + metricRateValue = @@winContainerPrevMetricRate[containerId] + else + # This can happen when the metric value returns same values for subsequent calls when the plugin first starts + metricRateValue = 0 + end end @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime metricValue = metricRateValue + @@winContainerPrevMetricRate[containerId] = metricRateValue end metricCollections["Value"] = metricValue metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) + @Log.info "metric item: #{metricItem}" end end end #Sending ContainerInventoryTelemetry from replicaset for telemetry purposes - timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= 5) - @@telemetryTimeTracker = DateTime.now.to_time.to_i - telemetryProperties = {} - telemetryProperties["Computer"] = hostname - # Hardcoding the event to ContainerInventory hearbeat event since the telemetry is pivoted off of this event. - ApplicationInsightsUtility.sendCustomEvent("ContainerInventoryHeartBeatEvent", telemetryProperties) + if @@nodeTelemetryTimeTracker[hostName].nil? + @@nodeTelemetryTimeTracker[hostName] = DateTime.now.to_time.to_i + else + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker[hostName]).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + @@nodeTelemetryTimeTracker[hostName] = DateTime.now.to_time.to_i + telemetryProperties = {} + telemetryProperties["Computer"] = hostName + # telemetryProperties["ContainerCount"] = containerCount + # Hardcoding the event to ContainerInventory hearbeat event since the telemetry is pivoted off of this event. + @Log.info "sending container inventory heartbeat telemetry" + ApplicationInsightsUtility.sendCustomEvent("ContainerInventoryHeartBeatEvent", telemetryProperties) + end end rescue => error @Log.warn("getcontainerCpuMetricItemRate failed: #{error} for metric #{cpuMetricNameToCollect}") @@ -420,13 +442,22 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl return nil else timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time - if timeDifference != 0 - metricRateValue = ((metricValue - @@nodeCpuUsageNanoSecondsLast) * 1.0) / timeDifference + nodeCpuUsageDifference = metricValue - @@nodeCpuUsageNanoSecondsLast + # nodeCpuUsageDifference check is added to make sure we report non zero values when cadvisor returns same values for subsequent calls + if timeDifference != 0 && nodeCpuUsageDifference != 0 + metricRateValue = (nodeCpuUsageDifference * 1.0) / timeDifference else - metricRateValue = 0 + @Log.info "linux node - cpu usage difference / time difference is 0, hence using previous cached value" + if !@@linuxNodePrevMetricRate.nil? + metricRateValue = @@linuxNodePrevMetricRate + else + # This can happen when the metric value returns same values for subsequent calls when the plugin first starts + metricRateValue = 0 + end end @@nodeCpuUsageNanoSecondsLast = metricValue @@nodeCpuUsageNanoSecondsTimeLast = metricTime + @@linuxNodePrevMetricRate = metricRateValue metricValue = metricRateValue end elsif operatingSystem == "Windows" @@ -437,13 +468,22 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl return nil else timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[hostName]).to_time - if timeDifference != 0 - metricRateValue = ((metricValue - @@winNodeCpuUsageNanoSecondsLast[hostName]) * 1.0) / timeDifference + nodeCpuUsageDifference = metricValue - @@winNodeCpuUsageNanoSecondsLast[hostName] + # nodeCpuUsageDifference check is added to make sure we report non zero values when cadvisor returns same values for subsequent calls + if timeDifference != 0 && nodeCpuUsageDifference != 0 + metricRateValue = (nodeCpuUsageDifference * 1.0) / timeDifference else - metricRateValue = 0 + @Log.info "windows node - cpu usage difference / time difference is 0, hence using previous cached value" + if !@@winNodePrevMetricRate[hostName].nil? + metricRateValue = @@winNodePrevMetricRate[hostName] + else + # This can happen when the metric value returns same values for subsequent calls when the plugin first starts + metricRateValue = 0 + end end @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime + @@winNodePrevMetricRate[hostName] = metricRateValue metricValue = metricRateValue end end diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 3d026b05f..d40345150 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -2,29 +2,28 @@ # frozen_string_literal: true module Fluent - class Kube_PodInventory_Input < Input - Plugin.register_input('kubepodinventory', self) + Plugin.register_input("kubepodinventory", self) - @@MDMKubePodInventoryTag = 'mdm.kubepodinventory' + @@MDMKubePodInventoryTag = "mdm.kubepodinventory" + @@hostName = (OMS::Common.get_hostname) def initialize super - require 'yaml' - require 'json' - require 'set' - - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' - require_relative 'oms_common' - require_relative 'omslog' + require "yaml" + require "json" + require "set" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" end - config_param :run_interval, :time, :default => '1m' + config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.containerinsights.KubePodInventory" - def configure (conf) + def configure(conf) super end @@ -48,29 +47,29 @@ def shutdown end end - def enumerate(podList = nil) - if podList.nil? - $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo('pods').body) - $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + def enumerate(podList = nil) + if podList.nil? + $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("pods").body) + $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + else + podInventory = podList + end + begin + if (!podInventory.empty? && podInventory.key?("items") && !podInventory["items"].empty?) + #get pod inventory & services + $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") + serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body) + $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(podInventory, serviceList) else - podInventory = podList + $log.warn "Received empty podInventory" end - begin - if(!podInventory.empty? && podInventory.key?("items") && !podInventory['items'].empty?) - #get pod inventory & services - $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo('services').body) - $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(podInventory, serviceList) - else - $log.warn "Received empty podInventory" - end - rescue => errorStr - $log.warn "Failed in enumerate pod inventory: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + rescue => errorStr + $log.warn "Failed in enumerate pod inventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end end def parse_and_emit_records(podInventory, serviceList) @@ -81,99 +80,99 @@ def parse_and_emit_records(podInventory, serviceList) controllerSet = Set.new [] telemetryFlush = false begin #begin block start - podInventory['items'].each do |items| #podInventory block start + podInventory["items"].each do |items| #podInventory block start records = [] record = {} - record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - record['Name'] = items['metadata']['name'] - podNameSpace = items['metadata']['namespace'] - - if podNameSpace.eql?("kube-system") && !items['metadata'].key?("ownerReferences") + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Name"] = items["metadata"]["name"] + podNameSpace = items["metadata"]["namespace"] + + if podNameSpace.eql?("kube-system") && !items["metadata"].key?("ownerReferences") # The above case seems to be the only case where you have horizontal scaling of pods # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash # instead of the actual poduid. Since this uid is not being surface into the UX # its ok to use this. # Use kubernetes.io/config.hash to be able to correlate with cadvisor data - podUid = items['metadata']['annotations']['kubernetes.io/config.hash'] + podUid = items["metadata"]["annotations"]["kubernetes.io/config.hash"] else - podUid = items['metadata']['uid'] + podUid = items["metadata"]["uid"] end - record['PodUid'] = podUid - record['PodLabel'] = [items['metadata']['labels']] - record['Namespace'] = podNameSpace - record['PodCreationTimeStamp'] = items['metadata']['creationTimestamp'] + record["PodUid"] = podUid + record["PodLabel"] = [items["metadata"]["labels"]] + record["Namespace"] = podNameSpace + record["PodCreationTimeStamp"] = items["metadata"]["creationTimestamp"] #for unscheduled (non-started) pods startTime does NOT exist - if !items['status']['startTime'].nil? - record['PodStartTime'] = items['status']['startTime'] + if !items["status"]["startTime"].nil? + record["PodStartTime"] = items["status"]["startTime"] else - record['PodStartTime'] = "" + record["PodStartTime"] = "" end #podStatus # the below is for accounting 'NodeLost' scenario, where-in the pod(s) in the lost node is still being reported as running podReadyCondition = true - if !items['status']['reason'].nil? && items['status']['reason'] == "NodeLost" && !items['status']['conditions'].nil? - items['status']['conditions'].each do |condition| - if condition['type'] == "Ready" && condition['status'] == "False" + if !items["status"]["reason"].nil? && items["status"]["reason"] == "NodeLost" && !items["status"]["conditions"].nil? + items["status"]["conditions"].each do |condition| + if condition["type"] == "Ready" && condition["status"] == "False" podReadyCondition = false break end end end if podReadyCondition == false - record['PodStatus'] = "Unknown" + record["PodStatus"] = "Unknown" else - record['PodStatus'] = items['status']['phase'] + record["PodStatus"] = items["status"]["phase"] end #for unscheduled (non-started) pods podIP does NOT exist - if !items['status']['podIP'].nil? - record['PodIp'] =items['status']['podIP'] + if !items["status"]["podIP"].nil? + record["PodIp"] = items["status"]["podIP"] else - record['PodIp'] = "" + record["PodIp"] = "" end #for unscheduled (non-started) pods nodeName does NOT exist - if !items['spec']['nodeName'].nil? - record['Computer'] = items['spec']['nodeName'] + if !items["spec"]["nodeName"].nil? + record["Computer"] = items["spec"]["nodeName"] else - record['Computer'] = "" - end - record['ClusterId'] = KubernetesApiClient.getClusterId - record['ClusterName'] = KubernetesApiClient.getClusterName - record['ServiceName'] = getServiceNameFromLabels(items['metadata']['namespace'], items['metadata']['labels'], serviceList) - # Adding telemetry to send pod telemetry every 5 minutes - timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - if (timeDifferenceInMinutes >= 5) - telemetryFlush = true - end - if !items['metadata']['ownerReferences'].nil? - record['ControllerKind'] = items['metadata']['ownerReferences'][0]['kind'] - record['ControllerName'] = items['metadata']['ownerReferences'][0]['name'] + record["Computer"] = "" + end + record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ServiceName"] = getServiceNameFromLabels(items["metadata"]["namespace"], items["metadata"]["labels"], serviceList) + # Adding telemetry to send pod telemetry every 5 minutes + timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + telemetryFlush = true + end + if !items["metadata"]["ownerReferences"].nil? + record["ControllerKind"] = items["metadata"]["ownerReferences"][0]["kind"] + record["ControllerName"] = items["metadata"]["ownerReferences"][0]["name"] if telemetryFlush == true - controllerSet.add(record['ControllerKind'] + record['ControllerName']) + controllerSet.add(record["ControllerKind"] + record["ControllerName"]) end end podRestartCount = 0 - record['PodRestartCount'] = 0 - if items['status'].key?("containerStatuses") && !items['status']['containerStatuses'].empty? #container status block start - items['status']['containerStatuses'].each do |container| - containerRestartCount = 0 - #container Id is of the form - #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 - if !container['containerID'].nil? - record['ContainerID'] = container['containerID'].split("//")[1] - else + record["PodRestartCount"] = 0 + if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? #container status block start + items["status"]["containerStatuses"].each do |container| + containerRestartCount = 0 + #container Id is of the form + #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 + if !container["containerID"].nil? + record["ContainerID"] = container["containerID"].split("//")[1] + else # for containers that have image issues (like invalid image/tag etc..) this will be empty. do not make it all 0 - record['ContainerID'] = "" + record["ContainerID"] = "" end - #keeping this as which is same as InstanceName in perf table - record['ContainerName'] = podUid + "/" +container['name'] - #Pod restart count is a sumtotal of restart counts of individual containers - #within the pod. The restart count of a container is maintained by kubernetes - #itself in the form of a container label. - containerRestartCount = container['restartCount'] - record['ContainerRestartCount'] = containerRestartCount - containerStatus = container['state'] - record['ContainerStatusReason'] = '' + #keeping this as which is same as InstanceName in perf table + record["ContainerName"] = podUid + "/" + container["name"] + #Pod restart count is a sumtotal of restart counts of individual containers + #within the pod. The restart count of a container is maintained by kubernetes + #itself in the form of a container label. + containerRestartCount = container["restartCount"] + record["ContainerRestartCount"] = containerRestartCount + containerStatus = container["state"] + record["ContainerStatusReason"] = "" # state is of the following form , so just picking up the first key name # "state": { # "waiting": { @@ -183,55 +182,57 @@ def parse_and_emit_records(podInventory, serviceList) # }, # the below is for accounting 'NodeLost' scenario, where-in the containers in the lost node/pod(s) is still being reported as running if podReadyCondition == false - record['ContainerStatus'] = "Unknown" + record["ContainerStatus"] = "Unknown" else - record['ContainerStatus'] = containerStatus.keys[0] + record["ContainerStatus"] = containerStatus.keys[0] end #TODO : Remove ContainerCreationTimeStamp from here since we are sending it as a metric #Picking up both container and node start time from cAdvisor to be consistent if containerStatus.keys[0] == "running" - record['ContainerCreationTimeStamp'] = container['state']['running']['startedAt'] + record["ContainerCreationTimeStamp"] = container["state"]["running"]["startedAt"] else - if !containerStatus[containerStatus.keys[0]]['reason'].nil? && !containerStatus[containerStatus.keys[0]]['reason'].empty? - record['ContainerStatusReason'] = containerStatus[containerStatus.keys[0]]['reason'] + if !containerStatus[containerStatus.keys[0]]["reason"].nil? && !containerStatus[containerStatus.keys[0]]["reason"].empty? + record["ContainerStatusReason"] = containerStatus[containerStatus.keys[0]]["reason"] end end - podRestartCount += containerRestartCount - records.push(record.dup) - end + podRestartCount += containerRestartCount + records.push(record.dup) + end else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod - records.push(record) + records.push(record) end #container status block end records.each do |record| if !record.nil? - record['PodRestartCount'] = podRestartCount + record["PodRestartCount"] = podRestartCount wrapper = { - "DataType"=>"KUBE_POD_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] + "DataType" => "KUBE_POD_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], } eventStream.add(emitTime, wrapper) if wrapper - end - end + end + end end #podInventory block end router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@@MDMKubePodInventoryTag, eventStream) if eventStream if telemetryFlush == true - ApplicationInsightsUtility.sendHeartBeatEvent("KubePodInventory") - ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory['items'].length , {}) - ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length , {}) + telemetryProperties = {} + telemetryProperties["Computer"] = @@hostName + ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory["items"].length, {}) + ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length, {}) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - rescue => errorStr + rescue => errorStr $log.warn "Failed in parse_and_emit_record pod inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end #begin block end - end + end #begin block end + end def run_periodic @mutex.lock @@ -257,37 +258,33 @@ def run_periodic def getServiceNameFromLabels(namespace, labels, serviceList) serviceName = "" begin - if !labels.nil? && !labels.empty? - if( !serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList['items'].empty?) - serviceList['items'].each do |item| + if !labels.nil? && !labels.empty? + if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].empty?) + serviceList["items"].each do |item| found = 0 - if !item['spec'].nil? && !item['spec']['selector'].nil? && item['metadata']['namespace'] == namespace - selectorLabels = item['spec']['selector'] + if !item["spec"].nil? && !item["spec"]["selector"].nil? && item["metadata"]["namespace"] == namespace + selectorLabels = item["spec"]["selector"] if !selectorLabels.empty? - selectorLabels.each do |key,value| - if !(labels.select {|k,v| k==key && v==value}.length > 0) + selectorLabels.each do |key, value| + if !(labels.select { |k, v| k == key && v == value }.length > 0) break end found = found + 1 end - end + end if found == selectorLabels.length - return item['metadata']['name'] + return item["metadata"]["name"] end - end + end end - end + end end - rescue => errorStr + rescue => errorStr $log.warn "Failed to retrieve service name from labels: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return serviceName end - end # Kube_Pod_Input - end # module - - From e46cfc74597b455d5d3c07a826f482b25124d0b9 Mon Sep 17 00:00:00 2001 From: rashmy Date: Mon, 18 Mar 2019 11:56:49 -0700 Subject: [PATCH 15/38] cahnges for mdm metrics --- installer/conf/kube.conf | 24 ++++++++++++++++++++++ source/code/plugin/in_win_cadvisor_perf.rb | 2 ++ 2 files changed, 26 insertions(+) diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index f8aa67800..b0da9f5b4 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -61,6 +61,30 @@ log_level info +#custom_metrics_mdm filter plugin for perf data from windows nodes + + type filter_cadvisor2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes + log_level info + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + type out_oms log_level debug diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 48a127dde..20c675489 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -20,6 +20,7 @@ def initialize config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.api.wincadvisorperf" + config_param :mdmtag, :string, :default => "mdm.cadvisorperf" def configure(conf) super @@ -66,6 +67,7 @@ def enumerate() end end router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@mdmtag, eventStream) if eventStream @@istestvar = ENV["ISTEST"] if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") From 414cb83b7d047755c3176baed2db6afff7084171 Mon Sep 17 00:00:00 2001 From: rashmy Date: Mon, 18 Mar 2019 18:22:51 -0700 Subject: [PATCH 16/38] changes --- source/code/plugin/KubernetesApiClient.rb | 11 ++++++++ source/code/plugin/in_kube_podinventory.rb | 32 ++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 17e7adedb..a30f519ac 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -24,6 +24,7 @@ class KubernetesApiClient @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@TokenStr = nil @@NodeMetrics = Hash.new + @@WinNodeArray = [] def initialize end @@ -231,10 +232,16 @@ def getWindowsNodes # check for windows operating system in node metadata winNode = {} nodeStatus = item["status"] + nodeMetadata = item["metadata"] if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? @Log.info "KubernetesAPIClient::getWindowsNodes : Iterating through nodes to get windows nodes" operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data + # to get images and image tags for containers in windows nodes + if !nodeMetadata.nil? && !nodeMetadata["name"].nil? + @@WinNodeArray.push(nodeMetadata["name"]) + end nodeStatusAddresses = nodeStatus["addresses"] if !nodeStatusAddresses.nil? nodeStatusAddresses.each do |address| @@ -252,6 +259,10 @@ def getWindowsNodes end end + def getWindowsNodesArray + return @@WinNodeArray + end + def getContainerIDs(namespace) containers = Hash.new begin diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index d40345150..e96e6bdda 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -80,7 +80,11 @@ def parse_and_emit_records(podInventory, serviceList) controllerSet = Set.new [] telemetryFlush = false begin #begin block start + # Getting windows nodes from kubeapi + winNodes = KubernetesApiClient.getWindowsNodesArray podInventory["items"].each do |items| #podInventory block start + containerInventoryRecords = [] + containerInventoryRecord = {} records = [] record = {} record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated @@ -197,6 +201,33 @@ def parse_and_emit_records(podInventory, serviceList) end podRestartCount += containerRestartCount records.push(record.dup) + + #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel + if winNodes.length > 0 + if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) + containerInventoryRecord["ContainerID"] = record["ContainerID"] + containerInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerInventoryRecord["Computer"] = record["Computer"] + containerInventoryRecord["ElementName"] = container["name"] + image = container["image"] + repoInfo = image.split("/") + if !repoInfo.nil? + containerInventoryRecord["Repository"] = repoInfo[0] + if !repoInfo[1].nil? + imageInfo = repoInfo[1].split(":") + if !imageInfo.nil? + containerInventoryRecord["Image"] = imageInfo[0] + containerInventoryRecord["ImageTag"] = imageInfo[1] + end + end + end + imageIdInfo = container["imageID"] + imageIdSplitInfo = imageIdInfo.split("@") + if !imageIdSplitInfo.nil? + containerInventoryRecord["ImageId"] = imageIdSplitInfo[1] + end + end + end end else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod records.push(record) @@ -213,6 +244,7 @@ def parse_and_emit_records(podInventory, serviceList) end end end #podInventory block end + router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@@MDMKubePodInventoryTag, eventStream) if eventStream if telemetryFlush == true From 6926d22e6846f16a3d1c56a246742c3e722ddcaa Mon Sep 17 00:00:00 2001 From: rashmy Date: Mon, 18 Mar 2019 18:27:41 -0700 Subject: [PATCH 17/38] cahnges --- source/code/plugin/in_kube_podinventory.rb | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index e96e6bdda..7ab1db58a 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -84,7 +84,6 @@ def parse_and_emit_records(podInventory, serviceList) winNodes = KubernetesApiClient.getWindowsNodesArray podInventory["items"].each do |items| #podInventory block start containerInventoryRecords = [] - containerInventoryRecord = {} records = [] record = {} record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated @@ -205,6 +204,7 @@ def parse_and_emit_records(podInventory, serviceList) #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel if winNodes.length > 0 if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) + containerInventoryRecord = {} containerInventoryRecord["ContainerID"] = record["ContainerID"] containerInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated containerInventoryRecord["Computer"] = record["Computer"] @@ -226,6 +226,7 @@ def parse_and_emit_records(podInventory, serviceList) if !imageIdSplitInfo.nil? containerInventoryRecord["ImageId"] = imageIdSplitInfo[1] end + containerInventoryRecords.push(containerInventoryRecord) end end end @@ -243,6 +244,17 @@ def parse_and_emit_records(podInventory, serviceList) eventStream.add(emitTime, wrapper) if wrapper end end + # Send container inventory records for containers on windows nodes + containerInventoryRecords.each do |cirecord| + if !cirecord.nil? + ciwrapper = { + "DataType" => "CONTAINER_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }], + } + eventStream.add(emitTime, ciwrapper) if ciwrapper + end + end end #podInventory block end router.emit_stream(@tag, eventStream) if eventStream From 7c29219810db85d175b904da5128fce630bad4ed Mon Sep 17 00:00:00 2001 From: rashmy Date: Mon, 18 Mar 2019 21:58:53 -0700 Subject: [PATCH 18/38] changes --- source/code/plugin/in_kube_podinventory.rb | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 7ab1db58a..ca6b6558c 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -79,6 +79,7 @@ def parse_and_emit_records(podInventory, serviceList) eventStream = MultiEventStream.new controllerSet = Set.new [] telemetryFlush = false + winContainerCount = 0 begin #begin block start # Getting windows nodes from kubeapi winNodes = KubernetesApiClient.getWindowsNodesArray @@ -205,9 +206,10 @@ def parse_and_emit_records(podInventory, serviceList) if winNodes.length > 0 if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) containerInventoryRecord = {} - containerInventoryRecord["ContainerID"] = record["ContainerID"] + containerInventoryRecord["InstanceID"] = record["ContainerID"] containerInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated containerInventoryRecord["Computer"] = record["Computer"] + containerInventoryRecord["ContainerHostname"] = record["Computer"] containerInventoryRecord["ElementName"] = container["name"] image = container["image"] repoInfo = image.split("/") @@ -245,6 +247,7 @@ def parse_and_emit_records(podInventory, serviceList) end end # Send container inventory records for containers on windows nodes + winContainerCount += containerInventoryRecords.length containerInventoryRecords.each do |cirecord| if !cirecord.nil? ciwrapper = { @@ -265,6 +268,10 @@ def parse_and_emit_records(podInventory, serviceList) ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory["items"].length, {}) ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length, {}) + if winContainerCount > 0 + telemetryProperties["ClusterWideWindowsContainersCount"] = winContainerCount + ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) + end @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end @@istestvar = ENV["ISTEST"] From 416dfee4d3c765bfaf7ca9dee2e88695e6aca495 Mon Sep 17 00:00:00 2001 From: rashmy Date: Tue, 19 Mar 2019 15:52:11 -0700 Subject: [PATCH 19/38] container states --- source/code/plugin/in_kube_podinventory.rb | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index ca6b6558c..0fa9869f4 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -228,6 +228,26 @@ def parse_and_emit_records(podInventory, serviceList) if !imageIdSplitInfo.nil? containerInventoryRecord["ImageId"] = imageIdSplitInfo[1] end + # Get container state + if containerStatus.keys[0] == "running" + containerInventoryRecord["State"] = "Running" + containerInventoryRecord["StartedTime"] = container["state"]["running"]["startedAt"] + elsif containerStatus.keys[0] == "terminated" + containerExitCode = container["state"]["terminated"]["exitCode"] + containerStartTime = container["state"]["terminated"]["startedAt"] + if containerExitCode < 0 + # Exit codes less than 0 are not supported by the engine + containerExitCode = 128 + end + if containerExitCode > 0 + containerInventoryRecord["State"] = "Failed" + containerInventoryRecord["ExitCode"] = containerExitCode + containerInventoryRecord["StartedTime"] = containerStartTime + else + containerInventoryRecord["State"] = "Stopped" + containerInventoryRecord["ExitCode"] = containerStartTime + end + end containerInventoryRecords.push(containerInventoryRecord) end end From f4f7d50e989bb80ed2c27d9281d2eeba397dcff3 Mon Sep 17 00:00:00 2001 From: rashmy Date: Tue, 19 Mar 2019 17:51:16 -0700 Subject: [PATCH 20/38] changes --- source/code/plugin/in_kube_podinventory.rb | 104 ++++++++++++--------- 1 file changed, 61 insertions(+), 43 deletions(-) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 0fa9869f4..3f41cca49 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -72,6 +72,56 @@ def enumerate(podList = nil) end end + def populateWindowsContainerInventoryRecord(container, record, batchTime) + containerInventoryRecord = {} + containerInventoryRecord["InstanceID"] = record["ContainerID"] + containerInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerInventoryRecord["Computer"] = record["Computer"] + containerInventoryRecord["ContainerHostname"] = record["Computer"] + containerInventoryRecord["ElementName"] = container["name"] + image = container["image"] + repoInfo = image.split("/") + if !repoInfo.nil? + containerInventoryRecord["Repository"] = repoInfo[0] + if !repoInfo[1].nil? + imageInfo = repoInfo[1].split(":") + if !imageInfo.nil? + containerInventoryRecord["Image"] = imageInfo[0] + containerInventoryRecord["ImageTag"] = imageInfo[1] + end + end + end + imageIdInfo = container["imageID"] + imageIdSplitInfo = imageIdInfo.split("@") + if !imageIdSplitInfo.nil? + containerInventoryRecord["ImageId"] = imageIdSplitInfo[1] + end + # Get container state + if containerStatus.keys[0] == "running" + containerInventoryRecord["State"] = "Running" + containerInventoryRecord["StartedTime"] = container["state"]["running"]["startedAt"] + elsif containerStatus.keys[0] == "terminated" + containerExitCode = container["state"]["terminated"]["exitCode"] + containerStartTime = container["state"]["terminated"]["startedAt"] + containerFinishTime = container["state"]["terminated"]["finishedAt"] + if containerExitCode < 0 + # Exit codes less than 0 are not supported by the engine + containerExitCode = 128 + end + if containerExitCode > 0 + containerInventoryRecord["State"] = "Failed" + else + containerInventoryRecord["State"] = "Stopped" + end + containerInventoryRecord["ExitCode"] = containerExitCode + containerInventoryRecord["StartedTime"] = containerStartTime + containerInventoryRecord["FinishedTime"] = containerFinishTime + elsif containerStatus.keys[0] == "waiting" + containerInventoryRecord["State"] = "Waiting" + end + return containerInventoryRecord + end + def parse_and_emit_records(podInventory, serviceList) currentTime = Time.now emitTime = currentTime.to_f @@ -80,6 +130,7 @@ def parse_and_emit_records(podInventory, serviceList) controllerSet = Set.new [] telemetryFlush = false winContainerCount = 0 + sendWindowsContainerInventoryRecord = false begin #begin block start # Getting windows nodes from kubeapi winNodes = KubernetesApiClient.getWindowsNodesArray @@ -89,6 +140,15 @@ def parse_and_emit_records(podInventory, serviceList) record = {} record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated record["Name"] = items["metadata"]["name"] + + # Setting this flag to true so that we can send ContainerInventory records for containers + # on windows nodes and parse environment variables for these containers + if winNodes.length > 0 + if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) + sendWindowsContainerInventoryRecord = true + end + end + podNameSpace = items["metadata"]["namespace"] if podNameSpace.eql?("kube-system") && !items["metadata"].key?("ownerReferences") @@ -205,49 +265,7 @@ def parse_and_emit_records(podInventory, serviceList) #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel if winNodes.length > 0 if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) - containerInventoryRecord = {} - containerInventoryRecord["InstanceID"] = record["ContainerID"] - containerInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - containerInventoryRecord["Computer"] = record["Computer"] - containerInventoryRecord["ContainerHostname"] = record["Computer"] - containerInventoryRecord["ElementName"] = container["name"] - image = container["image"] - repoInfo = image.split("/") - if !repoInfo.nil? - containerInventoryRecord["Repository"] = repoInfo[0] - if !repoInfo[1].nil? - imageInfo = repoInfo[1].split(":") - if !imageInfo.nil? - containerInventoryRecord["Image"] = imageInfo[0] - containerInventoryRecord["ImageTag"] = imageInfo[1] - end - end - end - imageIdInfo = container["imageID"] - imageIdSplitInfo = imageIdInfo.split("@") - if !imageIdSplitInfo.nil? - containerInventoryRecord["ImageId"] = imageIdSplitInfo[1] - end - # Get container state - if containerStatus.keys[0] == "running" - containerInventoryRecord["State"] = "Running" - containerInventoryRecord["StartedTime"] = container["state"]["running"]["startedAt"] - elsif containerStatus.keys[0] == "terminated" - containerExitCode = container["state"]["terminated"]["exitCode"] - containerStartTime = container["state"]["terminated"]["startedAt"] - if containerExitCode < 0 - # Exit codes less than 0 are not supported by the engine - containerExitCode = 128 - end - if containerExitCode > 0 - containerInventoryRecord["State"] = "Failed" - containerInventoryRecord["ExitCode"] = containerExitCode - containerInventoryRecord["StartedTime"] = containerStartTime - else - containerInventoryRecord["State"] = "Stopped" - containerInventoryRecord["ExitCode"] = containerStartTime - end - end + containerInventoryRecord = populateWindowsContainerInventoryRecord(container, record, batchTime) containerInventoryRecords.push(containerInventoryRecord) end end From 5d131bb5ca0c6793cd2a8cb6c52c851da5fe2125 Mon Sep 17 00:00:00 2001 From: rashmy Date: Tue, 19 Mar 2019 18:12:53 -0700 Subject: [PATCH 21/38] changes --- source/code/plugin/in_kube_podinventory.rb | 140 ++++++++++++--------- source/code/plugin/out_mdm.rb | 105 ++++++++-------- 2 files changed, 139 insertions(+), 106 deletions(-) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 3f41cca49..0c0df8776 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -72,54 +72,80 @@ def enumerate(podList = nil) end end - def populateWindowsContainerInventoryRecord(container, record, batchTime) - containerInventoryRecord = {} - containerInventoryRecord["InstanceID"] = record["ContainerID"] - containerInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - containerInventoryRecord["Computer"] = record["Computer"] - containerInventoryRecord["ContainerHostname"] = record["Computer"] - containerInventoryRecord["ElementName"] = container["name"] - image = container["image"] - repoInfo = image.split("/") - if !repoInfo.nil? - containerInventoryRecord["Repository"] = repoInfo[0] - if !repoInfo[1].nil? - imageInfo = repoInfo[1].split(":") - if !imageInfo.nil? - containerInventoryRecord["Image"] = imageInfo[0] - containerInventoryRecord["ImageTag"] = imageInfo[1] + def populateWindowsContainerInventoryRecord(container, record, containerEnvVariableHash, batchTime) + begin + containerInventoryRecord = {} + containerName = container["name"] + containerInventoryRecord["InstanceID"] = record["ContainerID"] + containerInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerInventoryRecord["Computer"] = record["Computer"] + containerInventoryRecord["ContainerHostname"] = record["Computer"] + containerInventoryRecord["ElementName"] = containerName + image = container["image"] + repoInfo = image.split("/") + if !repoInfo.nil? + containerInventoryRecord["Repository"] = repoInfo[0] + if !repoInfo[1].nil? + imageInfo = repoInfo[1].split(":") + if !imageInfo.nil? + containerInventoryRecord["Image"] = imageInfo[0] + containerInventoryRecord["ImageTag"] = imageInfo[1] + end end end - end - imageIdInfo = container["imageID"] - imageIdSplitInfo = imageIdInfo.split("@") - if !imageIdSplitInfo.nil? - containerInventoryRecord["ImageId"] = imageIdSplitInfo[1] - end - # Get container state - if containerStatus.keys[0] == "running" - containerInventoryRecord["State"] = "Running" - containerInventoryRecord["StartedTime"] = container["state"]["running"]["startedAt"] - elsif containerStatus.keys[0] == "terminated" - containerExitCode = container["state"]["terminated"]["exitCode"] - containerStartTime = container["state"]["terminated"]["startedAt"] - containerFinishTime = container["state"]["terminated"]["finishedAt"] - if containerExitCode < 0 - # Exit codes less than 0 are not supported by the engine - containerExitCode = 128 + imageIdInfo = container["imageID"] + imageIdSplitInfo = imageIdInfo.split("@") + if !imageIdSplitInfo.nil? + containerInventoryRecord["ImageId"] = imageIdSplitInfo[1] end - if containerExitCode > 0 - containerInventoryRecord["State"] = "Failed" - else - containerInventoryRecord["State"] = "Stopped" + # Get container state + containerStatus = container["state"] + if containerStatus.keys[0] == "running" + containerInventoryRecord["State"] = "Running" + containerInventoryRecord["StartedTime"] = container["state"]["running"]["startedAt"] + elsif containerStatus.keys[0] == "terminated" + containerExitCode = container["state"]["terminated"]["exitCode"] + containerStartTime = container["state"]["terminated"]["startedAt"] + containerFinishTime = container["state"]["terminated"]["finishedAt"] + if containerExitCode < 0 + # Exit codes less than 0 are not supported by the engine + containerExitCode = 128 + end + if containerExitCode > 0 + containerInventoryRecord["State"] = "Failed" + else + containerInventoryRecord["State"] = "Stopped" + end + containerInventoryRecord["ExitCode"] = containerExitCode + containerInventoryRecord["StartedTime"] = containerStartTime + containerInventoryRecord["FinishedTime"] = containerFinishTime + elsif containerStatus.keys[0] == "waiting" + containerInventoryRecord["State"] = "Waiting" + end + containerInventoryRecord["EnvironmentVar"] = containerEnvVariableHash[containerName] + return containerInventoryRecord + rescue => errorStr + $log.warn "Failed in populateWindowsContainerInventoryRecord: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def getContainerEnvironmentVariables(pod) + begin + podSpec = pod["spec"] + containerEnvHash = {} + if !podSpec.nil? && !podSpec["containers"].nil? + podSpec["containers"].each do |container| + containerEnvHash[container["name"]] = container["env"].to_s + end end - containerInventoryRecord["ExitCode"] = containerExitCode - containerInventoryRecord["StartedTime"] = containerStartTime - containerInventoryRecord["FinishedTime"] = containerFinishTime - elsif containerStatus.keys[0] == "waiting" - containerInventoryRecord["State"] = "Waiting" + return containerEnvHash + rescue => errorStr + $log.warn "Failed in getContainerEnvironmentVariables: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - return containerInventoryRecord end def parse_and_emit_records(podInventory, serviceList) @@ -140,15 +166,6 @@ def parse_and_emit_records(podInventory, serviceList) record = {} record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated record["Name"] = items["metadata"]["name"] - - # Setting this flag to true so that we can send ContainerInventory records for containers - # on windows nodes and parse environment variables for these containers - if winNodes.length > 0 - if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) - sendWindowsContainerInventoryRecord = true - end - end - podNameSpace = items["metadata"]["namespace"] if podNameSpace.eql?("kube-system") && !items["metadata"].key?("ownerReferences") @@ -199,6 +216,16 @@ def parse_and_emit_records(podInventory, serviceList) else record["Computer"] = "" end + + # Setting this flag to true so that we can send ContainerInventory records for containers + # on windows nodes and parse environment variables for these containers + if winNodes.length > 0 + if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) + sendWindowsContainerInventoryRecord = true + containerEnvVariableHash = getContainerEnvironmentVariables(items) + end + end + record["ClusterId"] = KubernetesApiClient.getClusterId record["ClusterName"] = KubernetesApiClient.getClusterName record["ServiceName"] = getServiceNameFromLabels(items["metadata"]["namespace"], items["metadata"]["labels"], serviceList) @@ -263,11 +290,9 @@ def parse_and_emit_records(podInventory, serviceList) records.push(record.dup) #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel - if winNodes.length > 0 - if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) - containerInventoryRecord = populateWindowsContainerInventoryRecord(container, record, batchTime) - containerInventoryRecords.push(containerInventoryRecord) - end + if sendWindowsContainerInventoryRecord == true + containerInventoryRecord = populateWindowsContainerInventoryRecord(container, record, containerEnvVariableHash, batchTime) + containerInventoryRecords.push(containerInventoryRecord) end end else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod @@ -286,6 +311,7 @@ def parse_and_emit_records(podInventory, serviceList) end # Send container inventory records for containers on windows nodes winContainerCount += containerInventoryRecords.length + $log.info "cirecords: #{containerInventoryRecords}" containerInventoryRecords.each do |cirecord| if !cirecord.nil? ciwrapper = { diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 93b32ef50..ed18e9381 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -2,29 +2,27 @@ # frozen_string_literal: true module Fluent - class OutputMDM < BufferedOutput - config_param :retry_mdm_post_wait_minutes, :integer - Plugin.register_output('out_mdm', self) + Plugin.register_output("out_mdm", self) def initialize super - require 'net/http' - require 'net/https' - require 'uri' - require 'json' - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' + require "net/http" + require "net/https" + require "uri" + require "json" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" - @@token_resource_url = 'https://monitoring.azure.com/' - @@grant_type = 'client_credentials' - @@azure_json_path = '/etc/kubernetes/host/azure.json' + @@token_resource_url = "https://monitoring.azure.com/" + @@grant_type = "client_credentials" + @@azure_json_path = "/etc/kubernetes/host/azure.json" @@post_request_url_template = "https://%{aks_region}.monitoring.azure.com%{aks_resource_id}/metrics" @@token_url_template = "https://login.microsoftonline.com/%{tenant_id}/oauth2/token" @@plugin_name = "AKSCustomMetricsMDM" - + @data_hash = {} @token_url = nil @http_client = nil @@ -50,12 +48,15 @@ def start @can_send_data_to_mdm = false return end - # Handle the case where the file read fails. Send Telemetry and exit the plugin? + # Handle the case where the file read fails. Send Telemetry and exit the plugin? @data_hash = JSON.parse(file) - @token_url = @@token_url_template % {tenant_id: @data_hash['tenantId']} + @token_url = @@token_url_template % {tenant_id: @data_hash["tenantId"]} @cached_access_token = get_access_token - aks_resource_id = ENV['AKS_RESOURCE_ID'] - aks_region = ENV['AKS_REGION'] + # aks_resource_id = ENV['AKS_RESOURCE_ID'] + aks_resource_id = "/subscriptions/692aea0b-2d89-4e7e-ae30-fffe40782ee2/resourceGroups/rashmi-int-workspace/providers/Microsoft.ContainerService/managedClusters/rashmi-int-workspace" + # aks_region = ENV['AKS_REGION'] + aks_region = "eastus" + if aks_resource_id.to_s.empty? @log.info "Environment Variable AKS_RESOURCE_ID is not set.. " @can_send_data_to_mdm = false @@ -77,35 +78,44 @@ def start # get the access token only if the time to expiry is less than 5 minutes def get_access_token - if @cached_access_token.to_s.empty? || (Time.now + 5*60 > @token_expiry_time) # token is valid for 60 minutes. Refresh token 5 minutes from expiration + if @cached_access_token.to_s.empty? || (Time.now + 5 * 60 > @token_expiry_time) # token is valid for 60 minutes. Refresh token 5 minutes from expiration @log.info "Refreshing access token for out_mdm plugin.." token_uri = URI.parse(@token_url) http_access_token = Net::HTTP.new(token_uri.host, token_uri.port) http_access_token.use_ssl = true token_request = Net::HTTP::Post.new(token_uri.request_uri) + # token_request.set_form_data( + # { + # "grant_type" => @@grant_type, + # "client_id" => @data_hash["aadClientId"], + # "client_secret" => @data_hash["aadClientSecret"], + # "resource" => @@token_resource_url, + # } + # ) + token_request.set_form_data( { - 'grant_type' => @@grant_type, - 'client_id' => @data_hash['aadClientId'], - 'client_secret' => @data_hash['aadClientSecret'], - 'resource' => @@token_resource_url - } + "grant_type" => @@grant_type, + "client_id" => "c77c0339-d374-40db-b163-b4574c9a4814", + "client_secret" => "be8d3b1d-f787-45dd-91e7-93576f091233", + "resource" => @@token_resource_url, + } ) - + token_response = http_access_token.request(token_request) - # Handle the case where the response is not 200 + # Handle the case where the response is not 200 parsed_json = JSON.parse(token_response.body) - @token_expiry_time = Time.now + 59*60 # set the expiry time to be ~one hour from current time - @cached_access_token = parsed_json['access_token'] + @token_expiry_time = Time.now + 59 * 60 # set the expiry time to be ~one hour from current time + @cached_access_token = parsed_json["access_token"] end @cached_access_token - end + end def write_status_file(success, message) - fn = '/var/opt/microsoft/omsagent/log/MDMIngestion.status' + fn = "/var/opt/microsoft/omsagent/log/MDMIngestion.status" status = '{ "operation": "MDMIngestion", "success": "%s", "message": "%s" }' % [success, message] begin - File.open(fn,'w') { |file| file.write(status) } + File.open(fn, "w") { |file| file.write(status) } rescue => e @log.debug "Error:'#{e}'" ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) @@ -123,13 +133,13 @@ def format(tag, time, record) end end - # This method is called every flush interval. Send the buffer chunk to MDM. + # This method is called every flush interval. Send the buffer chunk to MDM. # 'chunk' is a buffer chunk that includes multiple formatted records def write(chunk) begin - if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes*60)) && @can_send_data_to_mdm + if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes * 60)) && @can_send_data_to_mdm post_body = [] - chunk.msgpack_each {|(tag, record)| + chunk.msgpack_each { |(tag, record)| post_body.push(record.to_json) } send_to_mdm post_body @@ -137,7 +147,7 @@ def write(chunk) if !@can_send_data_to_mdm @log.info "Cannot send data to MDM since all required conditions were not met" else - @log.info "Last Failed POST attempt to MDM was made #{((Time.now - @last_post_attempt_time)/60).round(1)} min ago. This is less than the current retry threshold of #{@retry_mdm_post_wait_minutes} min. NO-OP" + @log.info "Last Failed POST attempt to MDM was made #{((Time.now - @last_post_attempt_time) / 60).round(1)} min ago. This is less than the current retry threshold of #{@retry_mdm_post_wait_minutes} min. NO-OP" end end rescue Exception => e @@ -146,12 +156,12 @@ def write(chunk) end end - def send_to_mdm(post_body) + def send_to_mdm(post_body) begin access_token = get_access_token request = Net::HTTP::Post.new(@post_request_uri.request_uri) - request['Content-Type'] = "application/x-ndjson" - request['Authorization'] = "Bearer #{access_token}" + request["Content-Type"] = "application/x-ndjson" + request["Authorization"] = "Bearer #{access_token}" request.body = post_body.join("\n") response = @http_client.request(request) response.value # this throws for non 200 HTTP response code @@ -166,10 +176,10 @@ def send_to_mdm(post_body) @first_post_attempt_made = true ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) # Not raising exception, as that will cause retries to happen - elsif !response.code.empty? && response.code.start_with?('4') + elsif !response.code.empty? && response.code.start_with?("4") # Log 400 errors and continue @log.info "Non-retryable HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" - else + else # raise if the response code is non-400 @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" raise e @@ -186,7 +196,8 @@ def send_to_mdm(post_body) raise e end end - private + + private class ChunkErrorHandler include Configurable @@ -218,20 +229,20 @@ def router=(r) end def write(chunk) - chunk.msgpack_each {|(tag, record)| + chunk.msgpack_each { |(tag, record)| @error_handlers[tag].emit(record) } end - - private + + private def create_error_handlers(router) nop_handler = NopErrorHandler.new Hash.new() { |hash, tag| etag = OMS::Common.create_error_tag tag hash[tag] = router.match?(etag) ? - ErrorHandler.new(router, etag) : - nop_handler + ErrorHandler.new(router, etag) : + nop_handler } end @@ -251,10 +262,6 @@ def emit(record) # NOP end end - end - end # class OutputMDM - end # module Fluent - From f3e6c8e9f617fcda3368528f9df882216e4622ba Mon Sep 17 00:00:00 2001 From: rashmy Date: Tue, 19 Mar 2019 22:02:37 -0700 Subject: [PATCH 22/38] changes for env variables --- source/code/plugin/in_kube_podinventory.rb | 23 ++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 0c0df8776..2b262aa55 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -122,7 +122,9 @@ def populateWindowsContainerInventoryRecord(container, record, containerEnvVaria elsif containerStatus.keys[0] == "waiting" containerInventoryRecord["State"] = "Waiting" end - containerInventoryRecord["EnvironmentVar"] = containerEnvVariableHash[containerName] + if !containerEnvVariableHash.nil? && !containerEnvVariableHash.empty? + containerInventoryRecord["EnvironmentVar"] = containerEnvVariableHash[containerName] + end return containerInventoryRecord rescue => errorStr $log.warn "Failed in populateWindowsContainerInventoryRecord: #{errorStr}" @@ -137,7 +139,21 @@ def getContainerEnvironmentVariables(pod) containerEnvHash = {} if !podSpec.nil? && !podSpec["containers"].nil? podSpec["containers"].each do |container| - containerEnvHash[container["name"]] = container["env"].to_s + envVarsArray = [] + containerEnvArray = container["env"] + # Parsing the environment variable array of hashes to a string value + # since that is format being sent by container inventory workflow in daemonset + # Keeping it in the same format because the workflow expects it in this format + # and the UX expects an array of string for environment variables + if !containerEnvArray.nil? && !containerEnvArray.empty? + containerEnvArray.each do |envVarHash| + envName = envVarHash["name"] + envValue = envVarHash["value"] + envArrayElement = envName + "=" + envValue + envVarsArray.push(envArrayElement) + end + end + containerEnvHash[container["name"]] = envVarsArray.to_s end end return containerEnvHash @@ -156,11 +172,11 @@ def parse_and_emit_records(podInventory, serviceList) controllerSet = Set.new [] telemetryFlush = false winContainerCount = 0 - sendWindowsContainerInventoryRecord = false begin #begin block start # Getting windows nodes from kubeapi winNodes = KubernetesApiClient.getWindowsNodesArray podInventory["items"].each do |items| #podInventory block start + sendWindowsContainerInventoryRecord = false containerInventoryRecords = [] records = [] record = {} @@ -311,7 +327,6 @@ def parse_and_emit_records(podInventory, serviceList) end # Send container inventory records for containers on windows nodes winContainerCount += containerInventoryRecords.length - $log.info "cirecords: #{containerInventoryRecords}" containerInventoryRecords.each do |cirecord| if !cirecord.nil? ciwrapper = { From 90f2641356ade7fa780334fe39d558244c552408 Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 20 Mar 2019 12:49:29 -0700 Subject: [PATCH 23/38] changes --- .../code/plugin/CAdvisorMetricsAPIClient.rb | 21 +++++++++++++++++++ source/code/plugin/in_win_cadvisor_perf.rb | 10 +++++++++ 2 files changed, 31 insertions(+) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 73a4f4cce..28d57f935 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -30,8 +30,14 @@ class CAdvisorMetricsAPIClient @@winNodePrevMetricRate = {} @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + + #Containers a hash of node name and the last time telemetry was sent for this node @@nodeTelemetryTimeTracker = {} + # Keeping track of containers so that can delete the container from the container cpu cache when the container is deleted + # as a part of the cleanup routine + @@winContainerIdCache = [] + def initialize end @@ -202,6 +208,10 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met return metricItems end + def clearDeletedWinContainers() + @@cleanupRoutineTimeTracker = DateTime.now.to_time.to_i + end + # usageNanoCores doesnt exist for windows nodes. Hence need to compute this from usageCoreNanoSeconds def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) metricItems = [] @@ -237,6 +247,9 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricCollections["CounterName"] = metricNametoReturn containerId = podUid + "/" + containerName + # Adding the containers to the winContainerIdCache so that it can be used by the cleanup routine + # to clear the delted containers every 5 minutes + @@winContainerIdCache.push(containerId) if @@winContainerCpuUsageNanoSecondsLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsTimeLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsLast[containerId] > metricValue #when kubelet is restarted the last condition will be true @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime @@ -267,6 +280,14 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) + + # clean up routine to clear the deleted containers from the cache + timeDifference = (DateTime.now.to_time.to_i - @@cleanupRoutineTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + clearDeletedWinContainers + end + @Log.info "metric item: #{metricItem}" end end diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 20c675489..2d89e7117 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -33,6 +33,7 @@ def start @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i + @@cleanupRoutineTimeTracker = DateTime.now.to_time.to_i end end @@ -68,6 +69,15 @@ def enumerate() end router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream + + cleanupTimeDifference = (DateTime.now.to_time.to_i - @@cleanupRoutineTimeTracker).abs + cleanupTimeDifferenceInMinutes = cleanupTimeDifference / 60 + if (cleanupTimeDifferenceInMinutes >= 10) + $log.info "in_win_cadvisor_perf : Cleanup routine kicking in to clear deleted containers from cache" + + @@cleanupRoutineTimeTracker = DateTime.now.to_time.to_i + end + @@istestvar = ENV["ISTEST"] if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") From ca80a796af5c4748dbb7b1600af9cfac83637a89 Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 20 Mar 2019 14:04:10 -0700 Subject: [PATCH 24/38] changes --- source/code/plugin/CAdvisorMetricsAPIClient.rb | 12 ++++++++++-- source/code/plugin/in_win_cadvisor_perf.rb | 6 +++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 28d57f935..55859d14c 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -208,8 +208,16 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met return metricItems end - def clearDeletedWinContainers() - @@cleanupRoutineTimeTracker = DateTime.now.to_time.to_i + def clearDeletedWinContainersFromCache() + winCpuUsageNanoSecondsKeys = @@winContainerCpuUsageNanoSecondsLast.keys + winCpuUsageNanoSecondsTimeKeys = @@winContainerCpuUsageNanoSecondsTimeLast.keys + + # Find the container ids to be deleted from cache + winContainersToBeCleared = winCpuUsageNanoSecondsKeys - @@winContainerIdCache + end + + def resetWinContainerIdCache + return @@winContainerIdCache = [] end # usageNanoCores doesnt exist for windows nodes. Hence need to compute this from usageCoreNanoSeconds diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 2d89e7117..ea96ae5eb 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -54,6 +54,9 @@ def enumerate() $log.info "in_win_cadvisor_perf : Getting windows nodes" timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 + + #Resetting this cache so that it is populated with the current set of containers with every call + CAdvisorMetricsAPIClient.resetWinContainerIdCache() if (timeDifferenceInMinutes >= 5) @@winNodes = KubernetesApiClient.getWindowsNodes() $log.info "in_win_cadvisor_perf : Successuly got windows nodes after 5 minute interval" @@ -70,11 +73,12 @@ def enumerate() router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream + # Cleanup routine to clear deleted containers from cache cleanupTimeDifference = (DateTime.now.to_time.to_i - @@cleanupRoutineTimeTracker).abs cleanupTimeDifferenceInMinutes = cleanupTimeDifference / 60 if (cleanupTimeDifferenceInMinutes >= 10) $log.info "in_win_cadvisor_perf : Cleanup routine kicking in to clear deleted containers from cache" - + CAdvisorMetricsAPIClient.clearDeletedWinContainersFromCache() @@cleanupRoutineTimeTracker = DateTime.now.to_time.to_i end From 20346c2be95776ec198744e2021c846dfaa7ac8f Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 20 Mar 2019 14:06:28 -0700 Subject: [PATCH 25/38] changes --- .../code/plugin/CAdvisorMetricsAPIClient.rb | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 55859d14c..40d01cfe4 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -209,11 +209,20 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met end def clearDeletedWinContainersFromCache() - winCpuUsageNanoSecondsKeys = @@winContainerCpuUsageNanoSecondsLast.keys - winCpuUsageNanoSecondsTimeKeys = @@winContainerCpuUsageNanoSecondsTimeLast.keys - - # Find the container ids to be deleted from cache - winContainersToBeCleared = winCpuUsageNanoSecondsKeys - @@winContainerIdCache + begin + winCpuUsageNanoSecondsKeys = @@winContainerCpuUsageNanoSecondsLast.keys + winCpuUsageNanoSecondsTimeKeys = @@winContainerCpuUsageNanoSecondsTimeLast.keys + + # Find the container ids to be deleted from cache + winContainersToBeCleared = winCpuUsageNanoSecondsKeys - @@winContainerIdCache + winContainersToBeCleared.each do |containerId| + @@winContainerCpuUsageNanoSecondsLast.delete(containerId) + @@winContainerCpuUsageNanoSecondsTimeLast.delete(containerId) + end + rescue => errorStr + @Log.warn("clearDeletedWinContainersFromCache failed: #{errorStr}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end end def resetWinContainerIdCache From 5b67e56eb448fb4313a6474bf05e458b12b269b8 Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 20 Mar 2019 15:28:38 -0700 Subject: [PATCH 26/38] changes --- .../code/plugin/CAdvisorMetricsAPIClient.rb | 27 ++++++++++--------- source/code/plugin/in_win_cadvisor_perf.rb | 18 ++++++------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 40d01cfe4..cc25d3d93 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -215,6 +215,9 @@ def clearDeletedWinContainersFromCache() # Find the container ids to be deleted from cache winContainersToBeCleared = winCpuUsageNanoSecondsKeys - @@winContainerIdCache + if winContainersToBeCleared.length > 0 + @Log.warn "Stale containers found in cache, clearing...: #{winContainersToBeCleared}" + end winContainersToBeCleared.each do |containerId| @@winContainerCpuUsageNanoSecondsLast.delete(containerId) @@winContainerCpuUsageNanoSecondsTimeLast.delete(containerId) @@ -226,7 +229,7 @@ def clearDeletedWinContainersFromCache() end def resetWinContainerIdCache - return @@winContainerIdCache = [] + @@winContainerIdCache = [] end # usageNanoCores doesnt exist for windows nodes. Hence need to compute this from usageCoreNanoSeconds @@ -236,6 +239,7 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, clusterId = KubernetesApiClient.getClusterId timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 + @Log.warn "in host: #{hostName}" begin metricInfo = metricJSON metricInfo["pods"].each do |pod| @@ -297,15 +301,14 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) - - # clean up routine to clear the deleted containers from the cache - timeDifference = (DateTime.now.to_time.to_i - @@cleanupRoutineTimeTracker).abs - timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= 5) - clearDeletedWinContainers - end - - @Log.info "metric item: #{metricItem}" + # # clean up routine to clear the deleted containers from the cache + # timeDifference = (DateTime.now.to_time.to_i - @@cleanupRoutineTimeTracker).abs + # timeDifferenceInMinutes = timeDifference / 60 + # if (timeDifferenceInMinutes >= 5) + # clearDeletedWinContainers + # end + + # @Log.info "metric item: #{metricItem}" end end end @@ -366,7 +369,7 @@ def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollec metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) - @Log.info "#{metricItem}" + # @Log.info "#{metricItem}" #Telemetry about agent performance begin # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers @@ -541,7 +544,7 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) - @Log.info "#{metricItem}" + # @Log.info "#{metricItem}" end rescue => error @Log.warn("getNodeMetricItemRate failed: #{error} for metric #{metricNameToCollect}") diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index ea96ae5eb..a4945b33d 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -73,20 +73,20 @@ def enumerate() router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream - # Cleanup routine to clear deleted containers from cache - cleanupTimeDifference = (DateTime.now.to_time.to_i - @@cleanupRoutineTimeTracker).abs - cleanupTimeDifferenceInMinutes = cleanupTimeDifference / 60 - if (cleanupTimeDifferenceInMinutes >= 10) - $log.info "in_win_cadvisor_perf : Cleanup routine kicking in to clear deleted containers from cache" - CAdvisorMetricsAPIClient.clearDeletedWinContainersFromCache() - @@cleanupRoutineTimeTracker = DateTime.now.to_time.to_i - end - @@istestvar = ENV["ISTEST"] if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end end + + # Cleanup routine to clear deleted containers from cache + cleanupTimeDifference = (DateTime.now.to_time.to_i - @@cleanupRoutineTimeTracker).abs + cleanupTimeDifferenceInMinutes = cleanupTimeDifference / 60 + if (cleanupTimeDifferenceInMinutes >= 5) + $log.info "in_win_cadvisor_perf : Cleanup routine kicking in to clear deleted containers from cache" + CAdvisorMetricsAPIClient.clearDeletedWinContainersFromCache() + @@cleanupRoutineTimeTracker = DateTime.now.to_time.to_i + end rescue => errorStr $log.warn "Failed to retrieve cadvisor metric data for windows nodes: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) From 85e8b9a399074861c0dd4f491ce281fd2ae0c915 Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 20 Mar 2019 17:09:17 -0700 Subject: [PATCH 27/38] delete comments --- source/code/plugin/CAdvisorMetricsAPIClient.rb | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index cc25d3d93..7f726bf2c 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -301,14 +301,6 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) - # # clean up routine to clear the deleted containers from the cache - # timeDifference = (DateTime.now.to_time.to_i - @@cleanupRoutineTimeTracker).abs - # timeDifferenceInMinutes = timeDifference / 60 - # if (timeDifferenceInMinutes >= 5) - # clearDeletedWinContainers - # end - - # @Log.info "metric item: #{metricItem}" end end end @@ -322,7 +314,6 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, @@nodeTelemetryTimeTracker[hostName] = DateTime.now.to_time.to_i telemetryProperties = {} telemetryProperties["Computer"] = hostName - # telemetryProperties["ContainerCount"] = containerCount # Hardcoding the event to ContainerInventory hearbeat event since the telemetry is pivoted off of this event. @Log.info "sending container inventory heartbeat telemetry" ApplicationInsightsUtility.sendCustomEvent("ContainerInventoryHeartBeatEvent", telemetryProperties) @@ -369,7 +360,6 @@ def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollec metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) - # @Log.info "#{metricItem}" #Telemetry about agent performance begin # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers @@ -544,7 +534,6 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) - # @Log.info "#{metricItem}" end rescue => error @Log.warn("getNodeMetricItemRate failed: #{error} for metric #{metricNameToCollect}") From 6b0b146b57bc83871bf521e405bca1d30cfd7ab0 Mon Sep 17 00:00:00 2001 From: rashmy Date: Thu, 21 Mar 2019 15:50:23 -0700 Subject: [PATCH 28/38] changes --- source/code/plugin/CAdvisorMetricsAPIClient.rb | 2 -- 1 file changed, 2 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 7f726bf2c..df57f03ba 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -250,7 +250,6 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, if (!pod["containers"].nil?) pod["containers"].each do |container| #cpu metric - # containerCount += 1 containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] metricTime = container["cpu"]["time"] @@ -274,7 +273,6 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, if @@winContainerCpuUsageNanoSecondsLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsTimeLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsLast[containerId] > metricValue #when kubelet is restarted the last condition will be true @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime - # return nil next else timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time From 60ee4143a74fc67df52c8bc5c0262d143e63a4b5 Mon Sep 17 00:00:00 2001 From: rashmy Date: Thu, 21 Mar 2019 17:52:54 -0700 Subject: [PATCH 29/38] mutex changes --- source/code/plugin/KubernetesApiClient.rb | 58 ++++++++++++++-------- source/code/plugin/in_kube_podinventory.rb | 2 + source/code/plugin/in_win_cadvisor_perf.rb | 3 +- 3 files changed, 40 insertions(+), 23 deletions(-) diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index a30f519ac..02e6c58fe 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -25,6 +25,8 @@ class KubernetesApiClient @@TokenStr = nil @@NodeMetrics = Hash.new @@WinNodeArray = [] + @@mutex = Mutex.new + @@cv = ConditionVariable.new def initialize end @@ -227,32 +229,39 @@ def getWindowsNodes begin nodeInventory = JSON.parse(getKubeResourceInfo("nodes").body) @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" - if (!nodeInventory.empty?) - nodeInventory["items"].each do |item| - # check for windows operating system in node metadata - winNode = {} - nodeStatus = item["status"] - nodeMetadata = item["metadata"] - if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? - @Log.info "KubernetesAPIClient::getWindowsNodes : Iterating through nodes to get windows nodes" - operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] - if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) - # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data - # to get images and image tags for containers in windows nodes - if !nodeMetadata.nil? && !nodeMetadata["name"].nil? - @@WinNodeArray.push(nodeMetadata["name"]) - end - nodeStatusAddresses = nodeStatus["addresses"] - if !nodeStatusAddresses.nil? - nodeStatusAddresses.each do |address| - winNode[address["type"]] = address["address"] + @Log.warn "locking for population" + @@mutex.synchronize { + # Resetting the windows node cache + @@WinNodeArray.clear + if (!nodeInventory.empty?) + nodeInventory["items"].each do |item| + # check for windows operating system in node metadata + winNode = {} + nodeStatus = item["status"] + nodeMetadata = item["metadata"] + if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? + @Log.info "KubernetesAPIClient::getWindowsNodes : Iterating through nodes to get windows nodes" + operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data + # to get images and image tags for containers in windows nodes + if !nodeMetadata.nil? && !nodeMetadata["name"].nil? + @@WinNodeArray.push(nodeMetadata["name"]) + end + nodeStatusAddresses = nodeStatus["addresses"] + if !nodeStatusAddresses.nil? + nodeStatusAddresses.each do |address| + winNode[address["type"]] = address["address"] + end + winNodes.push(winNode) end - winNodes.push(winNode) end end end end - end + @Log.warn "releasing lock" + @@cv.signal + } return winNodes rescue => error @Log.warn("Error in get windows nodes: #{error}") @@ -260,7 +269,12 @@ def getWindowsNodes end def getWindowsNodesArray - return @@WinNodeArray + @@mutex.synchronize { + @Log.warn "waiting for signal" + @@cv.wait(@@mutex) + @Log.warn "got signal: proceeding" + return @@WinNodeArray + } end def getContainerIDs(namespace) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 2b262aa55..2e9170578 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -174,7 +174,9 @@ def parse_and_emit_records(podInventory, serviceList) winContainerCount = 0 begin #begin block start # Getting windows nodes from kubeapi + $log.warn "kubepodinventory: getting windows nodes" winNodes = KubernetesApiClient.getWindowsNodesArray + $log.warn "windows nodes: #{winNodes}" podInventory["items"].each do |items| #podInventory block start sendWindowsContainerInventoryRecord = false containerInventoryRecords = [] diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index a4945b33d..9008f209f 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -57,7 +57,8 @@ def enumerate() #Resetting this cache so that it is populated with the current set of containers with every call CAdvisorMetricsAPIClient.resetWinContainerIdCache() - if (timeDifferenceInMinutes >= 5) + if (timeDifferenceInMinutes >= 2) + $log.info "in_win_cadvisor_perf: getting windows nodes" @@winNodes = KubernetesApiClient.getWindowsNodes() $log.info "in_win_cadvisor_perf : Successuly got windows nodes after 5 minute interval" end From ca71c970c802f5f1e4a6db66c75d4ecd3a628a39 Mon Sep 17 00:00:00 2001 From: rashmy Date: Thu, 21 Mar 2019 21:24:57 -0700 Subject: [PATCH 30/38] changes --- source/code/plugin/KubernetesApiClient.rb | 60 +++++++++------------- source/code/plugin/in_kube_podinventory.rb | 3 +- source/code/plugin/in_win_cadvisor_perf.rb | 2 +- 3 files changed, 26 insertions(+), 39 deletions(-) diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 02e6c58fe..94a8254e5 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -25,8 +25,6 @@ class KubernetesApiClient @@TokenStr = nil @@NodeMetrics = Hash.new @@WinNodeArray = [] - @@mutex = Mutex.new - @@cv = ConditionVariable.new def initialize end @@ -229,39 +227,34 @@ def getWindowsNodes begin nodeInventory = JSON.parse(getKubeResourceInfo("nodes").body) @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" - @Log.warn "locking for population" - @@mutex.synchronize { - # Resetting the windows node cache - @@WinNodeArray.clear - if (!nodeInventory.empty?) - nodeInventory["items"].each do |item| - # check for windows operating system in node metadata - winNode = {} - nodeStatus = item["status"] - nodeMetadata = item["metadata"] - if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? - @Log.info "KubernetesAPIClient::getWindowsNodes : Iterating through nodes to get windows nodes" - operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] - if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) - # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data - # to get images and image tags for containers in windows nodes - if !nodeMetadata.nil? && !nodeMetadata["name"].nil? - @@WinNodeArray.push(nodeMetadata["name"]) - end - nodeStatusAddresses = nodeStatus["addresses"] - if !nodeStatusAddresses.nil? - nodeStatusAddresses.each do |address| - winNode[address["type"]] = address["address"] - end - winNodes.push(winNode) + # Resetting the windows node cache + @@WinNodeArray.clear + if (!nodeInventory.empty?) + nodeInventory["items"].each do |item| + # check for windows operating system in node metadata + winNode = {} + nodeStatus = item["status"] + nodeMetadata = item["metadata"] + if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? + @Log.info "KubernetesAPIClient::getWindowsNodes : Iterating through nodes to get windows nodes" + operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data + # to get images and image tags for containers in windows nodes + if !nodeMetadata.nil? && !nodeMetadata["name"].nil? + @@WinNodeArray.push(nodeMetadata["name"]) + end + nodeStatusAddresses = nodeStatus["addresses"] + if !nodeStatusAddresses.nil? + nodeStatusAddresses.each do |address| + winNode[address["type"]] = address["address"] end + winNodes.push(winNode) end end end end - @Log.warn "releasing lock" - @@cv.signal - } + end return winNodes rescue => error @Log.warn("Error in get windows nodes: #{error}") @@ -269,12 +262,7 @@ def getWindowsNodes end def getWindowsNodesArray - @@mutex.synchronize { - @Log.warn "waiting for signal" - @@cv.wait(@@mutex) - @Log.warn "got signal: proceeding" - return @@WinNodeArray - } + return @@WinNodeArray end def getContainerIDs(namespace) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 2e9170578..b904bb14d 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -174,9 +174,8 @@ def parse_and_emit_records(podInventory, serviceList) winContainerCount = 0 begin #begin block start # Getting windows nodes from kubeapi - $log.warn "kubepodinventory: getting windows nodes" winNodes = KubernetesApiClient.getWindowsNodesArray - $log.warn "windows nodes: #{winNodes}" + podInventory["items"].each do |items| #podInventory block start sendWindowsContainerInventoryRecord = false containerInventoryRecords = [] diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 9008f209f..3c97906b3 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -57,7 +57,7 @@ def enumerate() #Resetting this cache so that it is populated with the current set of containers with every call CAdvisorMetricsAPIClient.resetWinContainerIdCache() - if (timeDifferenceInMinutes >= 2) + if (timeDifferenceInMinutes >= 5) $log.info "in_win_cadvisor_perf: getting windows nodes" @@winNodes = KubernetesApiClient.getWindowsNodes() $log.info "in_win_cadvisor_perf : Successuly got windows nodes after 5 minute interval" From 4a9c034951bbe8177c953cbaaef03d9749f27a34 Mon Sep 17 00:00:00 2001 From: rashmy Date: Fri, 22 Mar 2019 15:09:30 -0700 Subject: [PATCH 31/38] changes --- source/code/plugin/KubernetesApiClient.rb | 1 - source/code/plugin/in_win_cadvisor_perf.rb | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 94a8254e5..9f859529d 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -236,7 +236,6 @@ def getWindowsNodes nodeStatus = item["status"] nodeMetadata = item["metadata"] if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? - @Log.info "KubernetesAPIClient::getWindowsNodes : Iterating through nodes to get windows nodes" operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 3c97906b3..b30f5352d 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -51,16 +51,16 @@ def enumerate() time = Time.now.to_f begin eventStream = MultiEventStream.new - $log.info "in_win_cadvisor_perf : Getting windows nodes" timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 #Resetting this cache so that it is populated with the current set of containers with every call CAdvisorMetricsAPIClient.resetWinContainerIdCache() if (timeDifferenceInMinutes >= 5) - $log.info "in_win_cadvisor_perf: getting windows nodes" + $log.info "in_win_cadvisor_perf: Getting windows nodes" @@winNodes = KubernetesApiClient.getWindowsNodes() $log.info "in_win_cadvisor_perf : Successuly got windows nodes after 5 minute interval" + @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i end @@winNodes.each do |winNode| metricData = CAdvisorMetricsAPIClient.getMetrics(winNode) From 4decfc62fc906e29574fa7a405a8ae30d605e272 Mon Sep 17 00:00:00 2001 From: rashmy Date: Sun, 24 Mar 2019 16:37:54 -0700 Subject: [PATCH 32/38] changes --- source/code/plugin/CAdvisorMetricsAPIClient.rb | 4 +++- source/code/plugin/out_mdm.rb | 6 ++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index df57f03ba..198a4119c 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -96,7 +96,9 @@ def getMetrics(winNode = nil) hostName = (OMS::Common.get_hostname) operatingSystem = "Linux" end - metricInfo = JSON.parse(getSummaryStatsFromCAdvisor(winNode).body) + if !getSummaryStatsFromCAdvisor(winNode).nil? + metricInfo = JSON.parse(getSummaryStatsFromCAdvisor(winNode).body) + end #@Log.info "metric info: #{metricInfo}" if !metricInfo.nil? metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index ed18e9381..8958c701d 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -52,10 +52,8 @@ def start @data_hash = JSON.parse(file) @token_url = @@token_url_template % {tenant_id: @data_hash["tenantId"]} @cached_access_token = get_access_token - # aks_resource_id = ENV['AKS_RESOURCE_ID'] - aks_resource_id = "/subscriptions/692aea0b-2d89-4e7e-ae30-fffe40782ee2/resourceGroups/rashmi-int-workspace/providers/Microsoft.ContainerService/managedClusters/rashmi-int-workspace" - # aks_region = ENV['AKS_REGION'] - aks_region = "eastus" + aks_resource_id = ENV['AKS_RESOURCE_ID'] + aks_region = ENV['AKS_REGION'] if aks_resource_id.to_s.empty? @log.info "Environment Variable AKS_RESOURCE_ID is not set.. " From 8fe457d33528a3806ca76305e4629d070e8f085c Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 27 Mar 2019 11:51:23 -0700 Subject: [PATCH 33/38] telemetry fix for docker version --- source/code/plugin/ApplicationInsightsUtility.rb | 4 ++-- source/code/plugin/CAdvisorMetricsAPIClient.rb | 7 +++++-- source/code/plugin/in_kube_nodes.rb | 5 +++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index d0422487b..81033ab51 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -56,7 +56,8 @@ def initializeUtility() @@CustomProperties["Region"] = ENV[@@EnvAksRegion] end - getDockerInfo() + #Commenting it for now from initilize method, we need to pivot all telemetry off of kubenode docker version + #getDockerInfo() @@CustomProperties["WorkspaceID"] = getWorkspaceId @@CustomProperties["AgentVersion"] = ENV[@@EnvAgentVersion] @@CustomProperties["ControllerType"] = ENV[@@EnvControllerType] @@ -179,7 +180,6 @@ def sendMetricTelemetry(metricName, metricValue, properties) getDockerInfo() end telemetryProps = {} - telemetryProps["Computer"] = @@hostName # add common dimensions @@CustomProperties.each { |k, v| telemetryProps[k] = v } # add passed-in dimensions if any diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 198a4119c..7b8fa40be 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -96,8 +96,9 @@ def getMetrics(winNode = nil) hostName = (OMS::Common.get_hostname) operatingSystem = "Linux" end - if !getSummaryStatsFromCAdvisor(winNode).nil? - metricInfo = JSON.parse(getSummaryStatsFromCAdvisor(winNode).body) + winNodeCAdvisorStats = getSummaryStatsFromCAdvisor(winNode) + if !winNodeCAdvisorStats.nil? + metricInfo = JSON.parse(winNodeCAdvisorStats.body) end #@Log.info "metric info: #{metricInfo}" if !metricInfo.nil? @@ -190,6 +191,7 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps = {} telemetryProps["PodName"] = podName telemetryProps["ContainerName"] = containerName + telemetryProps["Computer"] = hostName ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end end @@ -369,6 +371,7 @@ def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollec telemetryProps = {} telemetryProps["PodName"] = podName telemetryProps["ContainerName"] = containerName + telemetryProps["Computer"] = hostName ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end end diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 556d51821..dbfb7c168 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -128,6 +128,11 @@ def enumerate properties["Computer"] = record["Computer"] properties["KubeletVersion"] = record["KubeletVersion"] properties["OperatingSystem"] = nodeInfo["operatingSystem"] + #containerRuntimeVersion is of the form + #docker://18.9.0 + if !nodeInfo["containerRuntimeVersion"].nil? && !nodeInfo["containerRuntimeVersion"].split("//").nil? + properties["DockerVersion"] = nodeInfo["containerRuntimeVersion"].split("//")[1] + end capacityInfo = items["status"]["capacity"] ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) From 6e38a78c728788312905e1d632ff15f8f4c7a492 Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 27 Mar 2019 11:59:39 -0700 Subject: [PATCH 34/38] removing hardcoded values for mdm --- source/code/plugin/out_mdm.rb | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 8958c701d..963069858 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -52,8 +52,8 @@ def start @data_hash = JSON.parse(file) @token_url = @@token_url_template % {tenant_id: @data_hash["tenantId"]} @cached_access_token = get_access_token - aks_resource_id = ENV['AKS_RESOURCE_ID'] - aks_region = ENV['AKS_REGION'] + aks_resource_id = ENV["AKS_RESOURCE_ID"] + aks_region = ENV["AKS_REGION"] if aks_resource_id.to_s.empty? @log.info "Environment Variable AKS_RESOURCE_ID is not set.. " @@ -82,20 +82,11 @@ def get_access_token http_access_token = Net::HTTP.new(token_uri.host, token_uri.port) http_access_token.use_ssl = true token_request = Net::HTTP::Post.new(token_uri.request_uri) - # token_request.set_form_data( - # { - # "grant_type" => @@grant_type, - # "client_id" => @data_hash["aadClientId"], - # "client_secret" => @data_hash["aadClientSecret"], - # "resource" => @@token_resource_url, - # } - # ) - token_request.set_form_data( { "grant_type" => @@grant_type, - "client_id" => "c77c0339-d374-40db-b163-b4574c9a4814", - "client_secret" => "be8d3b1d-f787-45dd-91e7-93576f091233", + "client_id" => @data_hash["aadClientId"], + "client_secret" => @data_hash["aadClientSecret"], "resource" => @@token_resource_url, } ) From 9d07a505ed3cb183ed1c12ef3cce2d6d29102980 Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 27 Mar 2019 14:03:48 -0700 Subject: [PATCH 35/38] update docker version --- source/code/plugin/ApplicationInsightsUtility.rb | 2 +- source/code/plugin/in_kube_nodes.rb | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 81033ab51..4c3634d64 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -81,7 +81,7 @@ def getDockerInfo() dockerInfo = DockerApiClient.dockerInfo if (!dockerInfo.nil? && !dockerInfo.empty?) @@CustomProperties["DockerVersion"] = dockerInfo["Version"] - @@CustomProperties["DockerApiVersion"] = dockerInfo["ApiVersion"] + #@@CustomProperties["DockerApiVersion"] = dockerInfo["ApiVersion"] end end diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index dbfb7c168..aabda441e 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -128,11 +128,7 @@ def enumerate properties["Computer"] = record["Computer"] properties["KubeletVersion"] = record["KubeletVersion"] properties["OperatingSystem"] = nodeInfo["operatingSystem"] - #containerRuntimeVersion is of the form - #docker://18.9.0 - if !nodeInfo["containerRuntimeVersion"].nil? && !nodeInfo["containerRuntimeVersion"].split("//").nil? - properties["DockerVersion"] = nodeInfo["containerRuntimeVersion"].split("//")[1] - end + properties["DockerVersion"] = dockerVersion capacityInfo = items["status"]["capacity"] ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) From 9a0fc1f7320de8e93f9d58b1b4329c2263d61d8d Mon Sep 17 00:00:00 2001 From: rashmy Date: Thu, 28 Mar 2019 13:45:33 -0700 Subject: [PATCH 36/38] telemetry for windows cadvisor timeouts --- source/code/plugin/ApplicationInsightsUtility.rb | 11 +++++++++-- source/code/plugin/CAdvisorMetricsAPIClient.rb | 13 ++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 4c3634d64..5dc2bfab8 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -134,15 +134,22 @@ def sendCustomEvent(eventName, properties) end end - def sendExceptionTelemetry(errorStr) + def sendExceptionTelemetry(errorStr, properties = nil) begin if @@CustomProperties.empty? || @@CustomProperties.nil? initializeUtility() elsif @@CustomProperties["DockerVersion"].nil? getDockerInfo() end + telemetryProps = {} + # add common dimensions + @@CustomProperties.each { |k, v| telemetryProps[k] = v } + # add passed-in dimensions if any + if (!properties.nil? && !properties.empty?) + properties.each { |k, v| telemetryProps[k] = v } + end if !(@@Tc.nil?) - @@Tc.track_exception errorStr, :properties => @@CustomProperties + @@Tc.track_exception errorStr, :properties => telemetryProps @@Tc.flush $log.info("AppInsights Exception Telemetry sent successfully") end diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 7b8fa40be..ab223e5fe 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -59,6 +59,9 @@ def getSummaryStatsFromCAdvisor(winNode) end rescue => error @Log.warn("CAdvisor api request failed: #{error}") + telemetryProps = {} + telemetryProps["ErrorHostName"] = winNode["Hostname"] + ApplicationInsightsUtility.sendExceptionTelemetry(error, telemetryProps) end return response end @@ -96,14 +99,12 @@ def getMetrics(winNode = nil) hostName = (OMS::Common.get_hostname) operatingSystem = "Linux" end - winNodeCAdvisorStats = getSummaryStatsFromCAdvisor(winNode) - if !winNodeCAdvisorStats.nil? - metricInfo = JSON.parse(winNodeCAdvisorStats.body) + cAdvisorStats = getSummaryStatsFromCAdvisor(winNode) + if !cAdvisorStats.nil? + metricInfo = JSON.parse(cAdvisorStats.body) end - #@Log.info "metric info: #{metricInfo}" if !metricInfo.nil? metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) - # metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) if operatingSystem == "Linux" @@ -122,7 +123,6 @@ def getMetrics(winNode = nil) metricDataItems.push(cpuUsageNanoSecondsRate) end metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) - # metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch")) @@ -239,7 +239,6 @@ def resetWinContainerIdCache # usageNanoCores doesnt exist for windows nodes. Hence need to compute this from usageCoreNanoSeconds def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) metricItems = [] - # containerCount = 0 clusterId = KubernetesApiClient.getClusterId timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 From 74548994cdf00164497dfdfd61db77c3b691b9e3 Mon Sep 17 00:00:00 2001 From: rashmy Date: Thu, 28 Mar 2019 14:00:40 -0700 Subject: [PATCH 37/38] exeception key update to computer --- source/code/plugin/CAdvisorMetricsAPIClient.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index ab223e5fe..f986a3cfd 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -60,7 +60,7 @@ def getSummaryStatsFromCAdvisor(winNode) rescue => error @Log.warn("CAdvisor api request failed: #{error}") telemetryProps = {} - telemetryProps["ErrorHostName"] = winNode["Hostname"] + telemetryProps["Computer"] = winNode["Hostname"] ApplicationInsightsUtility.sendExceptionTelemetry(error, telemetryProps) end return response From 4f9532c78aa185614f2de3f014a65251b57ecf23 Mon Sep 17 00:00:00 2001 From: rashmy Date: Fri, 29 Mar 2019 17:52:52 -0700 Subject: [PATCH 38/38] PR comments --- installer/conf/kube.conf | 2 +- source/code/plugin/CAdvisorMetricsAPIClient.rb | 3 +++ source/code/plugin/KubernetesApiClient.rb | 1 + source/code/plugin/in_kube_podinventory.rb | 7 ++++++- source/code/plugin/in_win_cadvisor_perf.rb | 5 ++++- 5 files changed, 15 insertions(+), 3 deletions(-) diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index b0da9f5b4..0dfa3710e 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -65,7 +65,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes log_level info diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index f986a3cfd..8b4fd9fcf 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -245,6 +245,7 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, @Log.warn "in host: #{hostName}" begin metricInfo = metricJSON + containerCount = 0 metricInfo["pods"].each do |pod| podUid = pod["podRef"]["uid"] podName = pod["podRef"]["name"] @@ -253,6 +254,7 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, if (!pod["containers"].nil?) pod["containers"].each do |container| #cpu metric + containerCount += 1 containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] metricTime = container["cpu"]["time"] @@ -315,6 +317,7 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, @@nodeTelemetryTimeTracker[hostName] = DateTime.now.to_time.to_i telemetryProperties = {} telemetryProperties["Computer"] = hostName + telemetryProperties["ContainerCount"] = containerCount # Hardcoding the event to ContainerInventory hearbeat event since the telemetry is pivoted off of this event. @Log.info "sending container inventory heartbeat telemetry" ApplicationInsightsUtility.sendCustomEvent("ContainerInventoryHeartBeatEvent", telemetryProperties) diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 9f859529d..4ed85025f 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -257,6 +257,7 @@ def getWindowsNodes return winNodes rescue => error @Log.warn("Error in get windows nodes: #{error}") + return nil end end diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index b904bb14d..65573673c 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -153,7 +153,12 @@ def getContainerEnvironmentVariables(pod) envVarsArray.push(envArrayElement) end end - containerEnvHash[container["name"]] = envVarsArray.to_s + # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE + envValueString = envVarsArray.to_s + if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString) + envValueString = ["AZMON_COLLECT_ENV=FALSE"] + end + containerEnvHash[container["name"]] = envValueString end end return containerEnvHash diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index b30f5352d..2e5f839e6 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -58,7 +58,10 @@ def enumerate() CAdvisorMetricsAPIClient.resetWinContainerIdCache() if (timeDifferenceInMinutes >= 5) $log.info "in_win_cadvisor_perf: Getting windows nodes" - @@winNodes = KubernetesApiClient.getWindowsNodes() + nodes = KubernetesApiClient.getWindowsNodes() + if !nodes.nil? + @@winNodes = KubernetesApiClient.getWindowsNodes() + end $log.info "in_win_cadvisor_perf : Successuly got windows nodes after 5 minute interval" @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i end