diff --git a/installer/conf/container.conf b/installer/conf/container.conf
index 93c250fbb..fe74e176b 100644
--- a/installer/conf/container.conf
+++ b/installer/conf/container.conf
@@ -112,3 +112,17 @@
max_retry_wait 5m
retry_mdm_post_wait_minutes 60
+
+<match oms.api.InsightsMetrics>
+ type out_oms
+ log_level debug
+ num_threads 5
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer
+ buffer_queue_full_action drop_oldest_chunk
+ buffer_chunk_limit 4m
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+</match>
diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf
index 207780442..b2f912191 100644
--- a/installer/conf/kube.conf
+++ b/installer/conf/kube.conf
@@ -215,4 +215,19 @@
retry_limit 10
retry_wait 5s
max_retry_wait 5m
+
+<match oms.api.InsightsMetrics>
+ type out_oms
+ log_level debug
+ num_threads 5
+ buffer_chunk_limit 4m
+ buffer_type file
+ buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer
+ buffer_queue_limit 20
+ buffer_queue_full_action drop_oldest_chunk
+ flush_interval 20s
+ retry_limit 10
+ retry_wait 5s
+ max_retry_wait 5m
+</match>
\ No newline at end of file
diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data
index 60de5af18..05466bf63 100644
--- a/installer/datafiles/base_container.data
+++ b/installer/datafiles/base_container.data
@@ -36,7 +36,7 @@ MAINTAINER: 'Microsoft Corporation'
/opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root
/opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/code/plugin/filter_inventory2mdm.rb; 644; root; root
/opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/code/plugin/CustomMetricsUtils.rb; 644; root; root
-
+/opt/microsoft/omsagent/plugin/constants.rb; source/code/plugin/constants.rb; 644; root; root
/opt/microsoft/omsagent/plugin/ApplicationInsightsUtility.rb; source/code/plugin/ApplicationInsightsUtility.rb; 644; root; root
/opt/microsoft/omsagent/plugin/ContainerInventoryState.rb; source/code/plugin/ContainerInventoryState.rb; 644; root; root
diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb
index 8b0105a6f..aea4c4ddc 100644
--- a/source/code/plugin/CAdvisorMetricsAPIClient.rb
+++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb
@@ -13,6 +13,7 @@ class CAdvisorMetricsAPIClient
require_relative "oms_common"
require_relative "KubernetesApiClient"
require_relative "ApplicationInsightsUtility"
+ require_relative "constants"
@configMapMountPath = "/etc/config/settings/log-data-collection-settings"
@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings"
@@ -282,6 +283,101 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met
return metricItems
end
+ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601)
+ metricDataItems = []
+ begin
+ cAdvisorStats = getSummaryStatsFromCAdvisor(winNode)
+ if !cAdvisorStats.nil?
+ metricInfo = JSON.parse(cAdvisorStats.body)
+ end
+ if !winNode.nil?
+ hostName = winNode["Hostname"]
+ operatingSystem = "Windows"
+ else
+ if !metricInfo.nil? && !metricInfo["node"].nil? && !metricInfo["node"]["nodeName"].nil?
+ hostName = metricInfo["node"]["nodeName"]
+ else
+ hostName = (OMS::Common.get_hostname)
+ end
+ operatingSystem = "Linux"
+ end
+ if !metricInfo.nil?
+ metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryTotal", "containerGpumemoryTotalBytes", metricTime))
+ metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime))
+ metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime))
+ else
+ @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}")
+ end
+ rescue => error
+ @Log.warn("CAdvisorMetricsAPIClient::getInsightsMetrics failed: #{error}")
+ return metricDataItems
+ end
+ return metricDataItems
+ end
+
+ def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime)
+ metricItems = []
+ clusterId = KubernetesApiClient.getClusterId
+ clusterName = KubernetesApiClient.getClusterName
+ begin
+ metricInfo = metricJSON
+ metricInfo["pods"].each do |pod|
+ podUid = pod["podRef"]["uid"]
+ podName = pod["podRef"]["name"]
+ podNamespace = pod["podRef"]["namespace"]
+
+ if (!pod["containers"].nil?)
+ pod["containers"].each do |container|
+ #gpu metrics
+ if (!container["accelerators"].nil?)
+ container["accelerators"].each do |accelerator|
+ if (!accelerator[metricNameToCollect].nil?) #empty check is invalid for non-strings
+ containerName = container["name"]
+ metricValue = accelerator[metricNameToCollect]
+
+
+ metricItem = {}
+ metricItem["CollectionTime"] = metricPollTime
+ metricItem["Computer"] = hostName
+ metricItem["Name"] = metricNametoReturn
+ metricItem["Value"] = metricValue
+ metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
+ metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE
+
+ metricTags = {}
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName
+ #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace
+
+ if (!accelerator["make"].nil? && !accelerator["make"].empty?)
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = accelerator["make"]
+ end
+
+ if (!accelerator["model"].nil? && !accelerator["model"].empty?)
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_MODEL] = accelerator["model"]
+ end
+
+ if (!accelerator["id"].nil? && !accelerator["id"].empty?)
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_ID] = accelerator["id"]
+ end
+
+ metricItem["Tags"] = metricTags
+
+ metricItems.push(metricItem)
+ end
+ end
+ end
+ end
+ end
+ end
+ rescue => errorStr
+ @Log.warn("getContainerGpuMetricsAsInsightsMetrics failed: #{errorStr} for metric #{metricNameToCollect}")
+ return metricItems
+ end
+ return metricItems
+ end
+
def clearDeletedWinContainersFromCache()
begin
winCpuUsageNanoSecondsKeys = @@winContainerCpuUsageNanoSecondsLast.keys
diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb
index e52c77884..b9418f149 100644
--- a/source/code/plugin/KubernetesApiClient.rb
+++ b/source/code/plugin/KubernetesApiClient.rb
@@ -10,6 +10,7 @@ class KubernetesApiClient
require "time"
require_relative "oms_common"
+ require_relative "constants"
@@ApiVersion = "v1"
@@ApiVersionApps = "v1"
@@ -430,6 +431,87 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName
return metricItems
end #getContainerResourceRequestAndLimits
+ def getContainerResourceRequestsAndLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
+ metricItems = []
+ begin
+ clusterId = getClusterId
+ clusterName = getClusterName
+
+ metricInfo = metricJSON
+ metricInfo["items"].each do |pod|
+ podNameSpace = pod["metadata"]["namespace"]
+ if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences")
+ # The above case seems to be the only case where you have horizontal scaling of pods
+ # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash
+ # instead of the actual poduid. Since this uid is not being surface into the UX
+ # its ok to use this.
+ # Use kubernetes.io/config.hash to be able to correlate with cadvisor data
+ if pod["metadata"]["annotations"].nil?
+ next
+ else
+ podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"]
+ end
+ else
+ podUid = pod["metadata"]["uid"]
+ end
+
+ podContainers = []
+ if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty?
+ podContainers = podContainers + pod["spec"]["containers"]
+ end
+ # Adding init containers to the record list as well.
+ if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty?
+ podContainers = podContainers + pod["spec"]["initContainers"]
+ end
+
+ if (!podContainers.nil? && !podContainers.empty?)
+ if (!pod["spec"]["nodeName"].nil?)
+ nodeName = pod["spec"]["nodeName"]
+ else
+ nodeName = "" #unscheduled pod. We still want to collect limits & requests for GPU
+ end
+ podContainers.each do |container|
+ metricValue = nil
+ containerName = container["name"]
+ #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z
+ if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?)
+ metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect])
+ else
+ #No container level limit for the given metric, so default to node level limit for non-gpu metrics
+ if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu")
+ nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect
+ metricValue = @@NodeMetrics[nodeMetricsHashKey]
+ end
+ end
+ if (!metricValue.nil?)
+ metricItem = {}
+ metricItem["CollectionTime"] = metricTime
+ metricItem["Computer"] = nodeName
+ metricItem["Name"] = metricNametoReturn
+ metricItem["Value"] = metricValue
+ metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
+ metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE
+
+ metricTags = {}
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName
+ #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace
+
+ metricItem["Tags"] = metricTags
+
+ metricItems.push(metricItem)
+ end
+ end
+ end
+ end
+ rescue => error
+ @Log.warn("getContainerResourceRequestsAndLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
+ return metricItems
+ end
+ return metricItems
+ end #getContainerResourceRequestAndLimitsAsInsightsMetrics
+
def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
metricItems = []
begin
@@ -473,6 +555,51 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet
return metricItems
end #parseNodeLimits
+ def parseNodeLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
+ metricItems = []
+ begin
+ metricInfo = metricJSON
+ clusterId = getClusterId
+ clusterName = getClusterName
+ #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics,
+ #if we are coming up with the time it should be same for all nodes
+ #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z
+ metricInfo["items"].each do |node|
+ if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?)
+
+ # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" or "amd.com/gpu" or "nvidia.com/gpu"
+ metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect])
+
+ metricItem = {}
+ metricItem["CollectionTime"] = metricTime
+ metricItem["Computer"] = node["metadata"]["name"]
+ metricItem["Name"] = metricNametoReturn
+ metricItem["Value"] = metricValue
+ metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
+ metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE
+
+ metricTags = {}
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect
+
+ metricItem["Tags"] = metricTags
+
+ metricItems.push(metricItem)
+ #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level.
+ #Currently if container level cpu & memory limits are not defined we default to node level limits
+ if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu")
+ @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue
+ #@Log.info ("Node metric hash: #{@@NodeMetrics}")
+ end
+ end
+ end
+ rescue => error
+ @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
+ end
+ return metricItems
+ end
+
def getMetricNumericValue(metricName, metricVal)
metricValue = metricVal.downcase
begin
@@ -538,6 +665,10 @@ def getMetricNumericValue(metricName, metricVal)
else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units')
metricValue = Float(metricValue) * 1000.0 ** 3
end
+ when "nvidia.com/gpu"
+ metricValue = Float(metricValue) * 1.0
+ when "amd.com/gpu"
+ metricValue = Float(metricValue) * 1.0
else
@Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. Returning 0 for metric value")
metricValue = 0
diff --git a/source/code/plugin/constants.rb b/source/code/plugin/constants.rb
new file mode 100644
index 000000000..20114ea2b
--- /dev/null
+++ b/source/code/plugin/constants.rb
@@ -0,0 +1,15 @@
+class Constants
+ INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms"
+ INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId"
+ INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName"
+ INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor"
+ INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu"
+ INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel"
+ INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId"
+ INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName"
+ INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerId"
+ INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace"
+ INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName"
+ INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind"
+ INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics"
+end
\ No newline at end of file
diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb
index 96aa66aa1..a44365e9d 100644
--- a/source/code/plugin/in_cadvisor_perf.rb
+++ b/source/code/plugin/in_cadvisor_perf.rb
@@ -15,6 +15,7 @@ def initialize
require_relative "CAdvisorMetricsAPIClient"
require_relative "oms_common"
require_relative "omslog"
+ require_relative "constants"
end
config_param :run_interval, :time, :default => 60
@@ -50,8 +51,10 @@ def enumerate()
currentTime = Time.now
time = currentTime.to_f
batchTime = currentTime.utc.iso8601
+ @@istestvar = ENV["ISTEST"]
begin
eventStream = MultiEventStream.new
+ insightsMetricsEventStream = MultiEventStream.new
metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime )
metricData.each do |record|
record["DataType"] = "LINUX_PERF_BLOB"
@@ -64,10 +67,38 @@ def enumerate()
router.emit_stream(@containerhealthtag, eventStream) if eventStream
router.emit_stream(@nodehealthtag, eventStream) if eventStream
- @@istestvar = ENV["ISTEST"]
+
if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0)
$log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}")
end
+
+ #start GPU InsightsMetrics items
+ begin
+ containerGPUusageInsightsMetricsDataItems = []
+ containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime))
+
+
+ containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord|
+ wrapper = {
+ "DataType" => "INSIGHTS_METRICS_BLOB",
+ "IPName" => "ContainerInsights",
+ "DataItems" => [insightsMetricsRecord],
+ }
+ insightsMetricsEventStream.add(time, wrapper) if wrapper
+ end
+
+ router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream
+
+ if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0)
+ $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+ end
+ rescue => errorStr
+ $log.warn "Failed when processing GPU Usage metrics in_cadvisor_perf : #{errorStr}"
+ $log.debug_backtrace(errorStr.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ #end GPU InsightsMetrics items
+
rescue => errorStr
$log.warn "Failed to retrieve cadvisor metric data: #{errorStr}"
$log.debug_backtrace(errorStr.backtrace)
diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb
index fa0994f43..6ca299408 100644
--- a/source/code/plugin/in_kube_nodes.rb
+++ b/source/code/plugin/in_kube_nodes.rb
@@ -31,6 +31,7 @@ def initialize
require_relative "oms_common"
require_relative "omslog"
@NODES_CHUNK_SIZE = "400"
+ require_relative "constants"
end
config_param :run_interval, :time, :default => 60
@@ -103,6 +104,8 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601)
telemetrySent = false
eventStream = MultiEventStream.new
containerNodeInventoryEventStream = MultiEventStream.new
+ insightsMetricsEventStream = MultiEventStream.new
+ @@istestvar = ENV["ISTEST"]
#get node inventory
nodeInventory["items"].each do |items|
record = {}
@@ -191,6 +194,20 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601)
capacityInfo = items["status"]["capacity"]
ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties)
+ begin
+ if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?)
+ properties["nvigpus"] = capacityInfo["nvidia.com/gpu"]
+ end
+
+ if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?)
+ properties["amdgpus"] = capacityInfo["amd.com/gpu"]
+ end
+ rescue => errorStr
+ $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}"
+ $log.debug_backtrace(errorStr.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+
#telemetry about prometheus metric collections settings for replicaset
if (File.file?(@@promConfigMountPath))
properties["rsPromInt"] = @@rsPromInterval
@@ -211,7 +228,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601)
if telemetrySent == true
@@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i
end
- @@istestvar = ENV["ISTEST"]
+
if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0)
$log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}")
end
@@ -235,6 +252,35 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601)
end
#end
router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream
+
+ #start GPU InsightsMetrics items
+ begin
+ nodeGPUInsightsMetricsDataItems = []
+ nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "nvidia.com/gpu", "nodeGpuAllocatable", batchTime))
+ nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "nvidia.com/gpu", "nodeGpuCapacity", batchTime))
+
+ nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "amd.com/gpu", "nodeGpuAllocatable", batchTime))
+ nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "amd.com/gpu", "nodeGpuCapacity", batchTime))
+
+ nodeGPUInsightsMetricsDataItems.each do |insightsMetricsRecord|
+ wrapper = {
+ "DataType" => "INSIGHTS_METRICS_BLOB",
+ "IPName" => "ContainerInsights",
+ "DataItems" => [insightsMetricsRecord],
+ }
+ insightsMetricsEventStream.add(emitTime, wrapper) if wrapper
+ end
+
+ router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream
+ if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0)
+ $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+ end
+ rescue => errorStr
+ $log.warn "Failed when processing GPU metrics in_kube_nodes : #{errorStr}"
+ $log.debug_backtrace(errorStr.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ #end GPU InsightsMetrics items
rescue => errorStr
$log.warn "Failed in enumerate for KubePerf from in_kube_nodes : #{errorStr}"
$log.debug_backtrace(errorStr.backtrace)
diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb
index 28b20bfc0..55371a292 100644
--- a/source/code/plugin/in_kube_podinventory.rb
+++ b/source/code/plugin/in_kube_podinventory.rb
@@ -22,6 +22,7 @@ def initialize
require_relative "ApplicationInsightsUtility"
require_relative "oms_common"
require_relative "omslog"
+ require_relative "constants"
@PODS_CHUNK_SIZE = "1500"
@podCount = 0
@@ -251,6 +252,7 @@ def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso86
emitTime = currentTime.to_f
#batchTime = currentTime.utc.iso8601
eventStream = MultiEventStream.new
+ @@istestvar = ENV["ISTEST"]
begin #begin block start
# Getting windows nodes from kubeapi
@@ -488,6 +490,7 @@ def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso86
containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "memory", "memoryLimitBytes", batchTime))
kubePerfEventStream = MultiEventStream.new
+ insightsMetricsEventStream = MultiEventStream.new
containerMetricDataItems.each do |record|
record["DataType"] = "LINUX_PERF_BLOB"
@@ -496,6 +499,38 @@ def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso86
end
#end
router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream
+
+ begin
+ #start GPU InsightsMetrics items
+
+ containerGPUInsightsMetricsDataItems = []
+ containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime))
+ containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime))
+
+ containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "amd.com/gpu", "containerGpuRequests", batchTime))
+ containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "amd.com/gpu", "containerGpuLimits", batchTime))
+
+ containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord|
+ wrapper = {
+ "DataType" => "INSIGHTS_METRICS_BLOB",
+ "IPName" => "ContainerInsights",
+ "DataItems" => [insightsMetricsRecord],
+ }
+ insightsMetricsEventStream.add(emitTime, wrapper) if wrapper
+ end
+
+ router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream
+
+ if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0)
+ $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+ end
+
+ #end GPU InsightsMetrics items
+ rescue => errorStr
+ $log.warn "Failed when processing GPU metrics in_kube_podinventory : #{errorStr}"
+ $log.debug_backtrace(errorStr.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
rescue => errorStr
$log.warn "Failed in parse_and_emit_record for KubePerf from in_kube_podinventory : #{errorStr}"
$log.debug_backtrace(errorStr.backtrace)
@@ -537,7 +572,7 @@ def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso86
#Updating value for AppInsights telemetry
@podCount += podInventory["items"].length
- @@istestvar = ENV["ISTEST"]
+
if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0)
$log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}")
end
diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb
index 695a686cf..38868f2f5 100644
--- a/source/code/plugin/in_win_cadvisor_perf.rb
+++ b/source/code/plugin/in_win_cadvisor_perf.rb
@@ -17,6 +17,7 @@ def initialize
require_relative "KubernetesApiClient"
require_relative "oms_common"
require_relative "omslog"
+ require_relative "constants"
end
config_param :run_interval, :time, :default => 60
@@ -52,8 +53,10 @@ def enumerate()
time = Time.now.to_f
begin
eventStream = MultiEventStream.new
+ insightsMetricsEventStream = MultiEventStream.new
timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs
timeDifferenceInMinutes = timeDifference / 60
+ @@istestvar = ENV["ISTEST"]
#Resetting this cache so that it is populated with the current set of containers with every call
CAdvisorMetricsAPIClient.resetWinContainerIdCache()
@@ -78,10 +81,36 @@ def enumerate()
router.emit_stream(@tag, eventStream) if eventStream
router.emit_stream(@mdmtag, eventStream) if eventStream
- @@istestvar = ENV["ISTEST"]
+
if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0)
$log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}")
end
+
+ #start GPU InsightsMetrics items
+ begin
+ containerGPUusageInsightsMetricsDataItems = []
+ containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601))
+
+ containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord|
+ wrapper = {
+ "DataType" => "INSIGHTS_METRICS_BLOB",
+ "IPName" => "ContainerInsights",
+ "DataItems" => [insightsMetricsRecord],
+ }
+ insightsMetricsEventStream.add(time, wrapper) if wrapper
+ end
+
+ router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream
+ if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0)
+ $log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
+ end
+ rescue => errorStr
+ $log.warn "Failed when processing GPU Usage metrics in_win_cadvisor_perf : #{errorStr}"
+ $log.debug_backtrace(errorStr.backtrace)
+ ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+ end
+ #end GPU InsightsMetrics items
+
end
# Cleanup routine to clear deleted containers from cache