diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 93c250fbb..fe74e176b 100644 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -112,3 +112,17 @@ max_retry_wait 5m retry_mdm_post_wait_minutes 60 + + + type out_oms + log_level debug + num_threads 5 + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer + buffer_queue_full_action drop_oldest_chunk + buffer_chunk_limit 4m + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 207780442..b2f912191 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -215,4 +215,19 @@ retry_limit 10 retry_wait 5s max_retry_wait 5m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m \ No newline at end of file diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 60de5af18..05466bf63 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -36,7 +36,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root /opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/code/plugin/filter_inventory2mdm.rb; 644; root; root /opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/code/plugin/CustomMetricsUtils.rb; 644; root; root - +/opt/microsoft/omsagent/plugin/constants.rb; source/code/plugin/constants.rb; 644; root; root /opt/microsoft/omsagent/plugin/ApplicationInsightsUtility.rb; source/code/plugin/ApplicationInsightsUtility.rb; 644; root; root /opt/microsoft/omsagent/plugin/ContainerInventoryState.rb; 
source/code/plugin/ContainerInventoryState.rb; 644; root; root diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 8b0105a6f..aea4c4ddc 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -13,6 +13,7 @@ class CAdvisorMetricsAPIClient require_relative "oms_common" require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" + require_relative "constants" @configMapMountPath = "/etc/config/settings/log-data-collection-settings" @promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" @@ -282,6 +283,101 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met return metricItems end + def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) + metricDataItems = [] + begin + cAdvisorStats = getSummaryStatsFromCAdvisor(winNode) + if !cAdvisorStats.nil? + metricInfo = JSON.parse(cAdvisorStats.body) + end + if !winNode.nil? + hostName = winNode["Hostname"] + operatingSystem = "Windows" + else + if !metricInfo.nil? && !metricInfo["node"].nil? && !metricInfo["node"]["nodeName"].nil? + hostName = metricInfo["node"]["nodeName"] + else + hostName = (OMS::Common.get_hostname) + end + operatingSystem = "Linux" + end + if !metricInfo.nil? 
+ metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryTotal", "containerGpumemoryTotalBytes", metricTime)) + metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) + metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) + else + @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") + end + rescue => error + @Log.warn("CAdvisorMetricsAPIClient::getInsightsMetrics failed: #{error}") + return metricDataItems + end + return metricDataItems + end + + def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + clusterName = KubernetesApiClient.getClusterName + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + + if (!pod["containers"].nil?) + pod["containers"].each do |container| + #gpu metrics + if (!container["accelerators"].nil?) + container["accelerators"].each do |accelerator| + if (!accelerator[metricNameToCollect].nil?) 
#empty check is invalid for non-strings + containerName = container["name"] + metricValue = accelerator[metricNameToCollect] + + + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName + #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace + + if (!accelerator["make"].nil? && !accelerator["make"].empty?) + metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = accelerator["make"] + end + + if (!accelerator["model"].nil? && !accelerator["model"].empty?) + metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_MODEL] = accelerator["model"] + end + + if (!accelerator["id"].nil? && !accelerator["id"].empty?) 
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_ID] = accelerator["id"] + end + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + end + end + end + end + end + end + rescue => errorStr + @Log.warn("getContainerGpuMetricsAsInsightsMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + return metricItems + end + return metricItems + end + def clearDeletedWinContainersFromCache() begin winCpuUsageNanoSecondsKeys = @@winContainerCpuUsageNanoSecondsLast.keys diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index e52c77884..b9418f149 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -10,6 +10,7 @@ class KubernetesApiClient require "time" require_relative "oms_common" + require_relative "constants" @@ApiVersion = "v1" @@ApiVersionApps = "v1" @@ -430,6 +431,87 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName return metricItems end #getContainerResourceRequestAndLimits + def getContainerResourceRequestsAndLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + metricItems = [] + begin + clusterId = getClusterId + clusterName = getClusterName + + metricInfo = metricJSON + metricInfo["items"].each do |pod| + podNameSpace = pod["metadata"]["namespace"] + if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences") + # The above case seems to be the only case where you have horizontal scaling of pods + # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash + # instead of the actual poduid. Since this uid is not being surface into the UX + # its ok to use this. + # Use kubernetes.io/config.hash to be able to correlate with cadvisor data + if pod["metadata"]["annotations"].nil? 
+ next + else + podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] + end + else + podUid = pod["metadata"]["uid"] + end + + podContainers = [] + if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? + podContainers = podContainers + pod["spec"]["containers"] + end + # Adding init containers to the record list as well. + if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? + podContainers = podContainers + pod["spec"]["initContainers"] + end + + if (!podContainers.nil? && !podContainers.empty?) + if (!pod["spec"]["nodeName"].nil?) + nodeName = pod["spec"]["nodeName"] + else + nodeName = "" #unscheduled pod. We still want to collect limits & requests for GPU + end + podContainers.each do |container| + metricValue = nil + containerName = container["name"] + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) + metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) + else + #No container level limit for the given metric, so default to node level limit for non-gpu metrics + if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") + nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect + metricValue = @@NodeMetrics[nodeMetricsHashKey] + end + end + if (!metricValue.nil?) 
+ metricItem = {} + metricItem["CollectionTime"] = metricTime + metricItem["Computer"] = nodeName + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName + #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + end + end + end + end + rescue => error + @Log.warn("getcontainerResourceRequestsAndLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + return metricItems + end + return metricItems + end #getContainerResourceRequestAndLimitsAsInsightsMetrics + def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] begin @@ -473,6 +555,51 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet return metricItems end #parseNodeLimits + def parseNodeLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + metricItems = [] + begin + metricInfo = metricJSON + clusterId = getClusterId + clusterName = getClusterName + #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, + #if we are coming up with the time it should be same for all nodes + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + metricInfo["items"].each do |node| + if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?) 
+ + # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" or "amd.com/gpu" or "nvidia.com/gpu" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem = {} + metricItem["CollectionTime"] = metricTime + metricItem["Computer"] = node["metadata"]["name"] + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level. 
+ #Currently if container level cpu & memory limits are not defined we default to node level limits + if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") + end + end + end + rescue => error + @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + end + return metricItems + end + def getMetricNumericValue(metricName, metricVal) metricValue = metricVal.downcase begin @@ -538,6 +665,10 @@ def getMetricNumericValue(metricName, metricVal) else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units') metricValue = Float(metricValue) * 1000.0 ** 3 end + when "nvidia.com/gpu" + metricValue = Float(metricValue) * 1.0 + when "amd.com/gpu" + metricValue = Float(metricValue) * 1.0 else @Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. 
Returning 0 for metric value") metricValue = 0 diff --git a/source/code/plugin/constants.rb b/source/code/plugin/constants.rb new file mode 100644 index 000000000..20114ea2b --- /dev/null +++ b/source/code/plugin/constants.rb @@ -0,0 +1,15 @@ +class Constants + INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms" + INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId" + INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName" + INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor" + INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu" + INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel" + INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId" + INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName" + INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerId" + INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" + INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" + INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" +end \ No newline at end of file diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index 96aa66aa1..a44365e9d 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -15,6 +15,7 @@ def initialize require_relative "CAdvisorMetricsAPIClient" require_relative "oms_common" require_relative "omslog" + require_relative "constants" end config_param :run_interval, :time, :default => 60 @@ -50,8 +51,10 @@ def enumerate() currentTime = Time.now time = currentTime.to_f batchTime = currentTime.utc.iso8601 + @@istestvar = ENV["ISTEST"] begin eventStream = MultiEventStream.new + insightsMetricsEventStream = MultiEventStream.new metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime ) metricData.each do |record| record["DataType"] = "LINUX_PERF_BLOB" @@ -64,10 +67,38 @@ def enumerate() router.emit_stream(@containerhealthtag, eventStream) if eventStream router.emit_stream(@nodehealthtag, eventStream) if
eventStream - @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + + #start GPU InsightsMetrics items + begin + containerGPUusageInsightsMetricsDataItems = [] + containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) + + + containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(time, wrapper) if wrapper + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + rescue => errorStr + $log.warn "Failed when processing GPU Usage metrics in_cadvisor_perf : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + #end GPU InsightsMetrics items + rescue => errorStr $log.warn "Failed to retrieve cadvisor metric data: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index fa0994f43..6ca299408 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -31,6 +31,7 @@ def initialize require_relative "oms_common" require_relative "omslog" @NODES_CHUNK_SIZE = "400" + require_relative "constants" end config_param :run_interval, :time, :default => 60 @@ -103,6 +104,8 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) telemetrySent 
= false eventStream = MultiEventStream.new containerNodeInventoryEventStream = MultiEventStream.new + insightsMetricsEventStream = MultiEventStream.new + @@istestvar = ENV["ISTEST"] #get node inventory nodeInventory["items"].each do |items| record = {} @@ -191,6 +194,20 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) capacityInfo = items["status"]["capacity"] ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + begin + if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) + properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] + end + + if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) + properties["amdgpus"] = capacityInfo["amd.com/gpu"] + end + rescue => errorStr + $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + #telemetry about prometheus metric collections settings for replicaset if (File.file?(@@promConfigMountPath)) properties["rsPromInt"] = @@rsPromInterval @@ -211,7 +228,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if telemetrySent == true @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i end - @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -235,6 +252,35 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) end #end router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + + #start GPU InsightsMetrics items + begin + nodeGPUInsightsMetricsDataItems = [] + nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "nvidia.com/gpu", "nodeGpuAllocatable", batchTime)) + nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "nvidia.com/gpu", "nodeGpuCapacity", batchTime)) + + nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "amd.com/gpu", "nodeGpuAllocatable", batchTime)) + nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "amd.com/gpu", "nodeGpuCapacity", batchTime)) + + nodeGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + rescue => errorStr + $log.warn "Failed when processing GPU metrics in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + #end GPU InsightsMetrics items rescue => errorStr $log.warn "Failed in enumerate for KubePerf from in_kube_nodes : #{errorStr}" $log.debug_backtrace(errorStr.backtrace) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 28b20bfc0..55371a292 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -22,6 +22,7 @@ def initialize require_relative "ApplicationInsightsUtility" require_relative "oms_common" require_relative "omslog" + require_relative "constants" @PODS_CHUNK_SIZE = "1500" @podCount = 0 @@ -251,6 +252,7 @@ def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso86 emitTime = currentTime.to_f #batchTime = currentTime.utc.iso8601 eventStream = MultiEventStream.new + @@istestvar = ENV["ISTEST"] begin #begin block start # Getting windows nodes from kubeapi @@ -488,6 +490,7 @@ def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso86 containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "memory", "memoryLimitBytes", batchTime)) kubePerfEventStream = MultiEventStream.new + insightsMetricsEventStream = MultiEventStream.new containerMetricDataItems.each do |record| record["DataType"] = "LINUX_PERF_BLOB" @@ -496,6 +499,38 @@ def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso86 end #end router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + + begin + #start GPU InsightsMetrics items + + containerGPUInsightsMetricsDataItems = [] + 
containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) + + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) + + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + #end GPU InsightsMetrics items + rescue => errorStr + $log.warn "Failed when processing GPU metrics in_kube_podinventory : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end rescue => errorStr $log.warn "Failed in parse_and_emit_record for KubePerf from in_kube_podinventory : #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -537,7 +572,7 @@ def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso86 #Updating value for AppInsights telemetry @podCount += podInventory["items"].length - @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 695a686cf..38868f2f5 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -17,6 +17,7 @@ def initialize require_relative "KubernetesApiClient" require_relative "oms_common" require_relative "omslog" + require_relative "constants" end config_param :run_interval, :time, :default => 60 @@ -52,8 +53,10 @@ def enumerate() time = Time.now.to_f begin eventStream = MultiEventStream.new + insightsMetricsEventStream = MultiEventStream.new timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 + @@istestvar = ENV["ISTEST"] #Resetting this cache so that it is populated with the current set of containers with every call CAdvisorMetricsAPIClient.resetWinContainerIdCache() @@ 
-78,10 +81,36 @@ def enumerate() router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream - @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + + #start GPU InsightsMetrics items + begin + containerGPUusageInsightsMetricsDataItems = [] + containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601)) + + containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(time, wrapper) if wrapper + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + rescue => errorStr + $log.warn "Failed when processing GPU Usage metrics in_win_cadvisor_perf : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + #end GPU InsightsMetrics items + end # Cleanup routine to clear deleted containers from cache