Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions installer/conf/container.conf
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,17 @@
max_retry_wait 5m
retry_mdm_post_wait_minutes 60
</match>

<match oms.api.InsightsMetrics**>
type out_oms
log_level debug
num_threads 5
buffer_type file
buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer
buffer_queue_full_action drop_oldest_chunk
buffer_chunk_limit 4m
flush_interval 20s
retry_limit 10
retry_wait 5s
max_retry_wait 5m
</match>
15 changes: 15 additions & 0 deletions installer/conf/kube.conf
Original file line number Diff line number Diff line change
Expand Up @@ -215,4 +215,19 @@
retry_limit 10
retry_wait 5s
max_retry_wait 5m
</match>

<match oms.api.InsightsMetrics**>
type out_oms
log_level debug
num_threads 5
buffer_chunk_limit 4m
buffer_type file
buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer
buffer_queue_limit 20
buffer_queue_full_action drop_oldest_chunk
flush_interval 20s
retry_limit 10
retry_wait 5s
max_retry_wait 5m
</match>
2 changes: 1 addition & 1 deletion installer/datafiles/base_container.data
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ MAINTAINER: 'Microsoft Corporation'
/opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root
/opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/code/plugin/filter_inventory2mdm.rb; 644; root; root
/opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/code/plugin/CustomMetricsUtils.rb; 644; root; root

/opt/microsoft/omsagent/plugin/constants.rb; source/code/plugin/constants.rb; 644; root; root

/opt/microsoft/omsagent/plugin/ApplicationInsightsUtility.rb; source/code/plugin/ApplicationInsightsUtility.rb; 644; root; root
/opt/microsoft/omsagent/plugin/ContainerInventoryState.rb; source/code/plugin/ContainerInventoryState.rb; 644; root; root
Expand Down
96 changes: 96 additions & 0 deletions source/code/plugin/CAdvisorMetricsAPIClient.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class CAdvisorMetricsAPIClient
require_relative "oms_common"
require_relative "KubernetesApiClient"
require_relative "ApplicationInsightsUtility"
require_relative "constants"

@configMapMountPath = "/etc/config/settings/log-data-collection-settings"
@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings"
Expand Down Expand Up @@ -282,6 +283,101 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met
return metricItems
end

def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601)
  # Collects GPU (accelerator) InsightsMetrics records from cAdvisor's
  # summary stats for one node: the given Windows node hash, or the local
  # Linux node when winNode is nil. Returns an array of metric record
  # hashes; returns whatever was accumulated (possibly empty) on error.
  insightsMetrics = []
  begin
    response = getSummaryStatsFromCAdvisor(winNode)
    metricInfo = response.nil? ? nil : JSON.parse(response.body)

    if winNode.nil?
      operatingSystem = "Linux"
      # Prefer the node name cAdvisor reports; fall back to the agent host name.
      nodeInfo = metricInfo.nil? ? nil : metricInfo["node"]
      hostName = (nodeInfo.nil? ? nil : nodeInfo["nodeName"]) || (OMS::Common.get_hostname)
    else
      operatingSystem = "Windows"
      hostName = winNode["Hostname"]
    end

    if metricInfo.nil?
      @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}")
    else
      # cAdvisor accelerator field name => metric name surfaced to the backend
      {
        "memoryTotal" => "containerGpumemoryTotalBytes",
        "memoryUsed" => "containerGpumemoryUsedBytes",
        "dutyCycle" => "containerGpuDutyCycle",
      }.each do |acceleratorField, metricName|
        insightsMetrics.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, acceleratorField, metricName, metricTime))
      end
    end
  rescue => error
    @Log.warn("CAdvisorMetricsAPIClient::getInsightsMetrics failed: #{error}")
    return insightsMetrics
  end
  insightsMetrics
end

def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime)
  # Walks a parsed cAdvisor summary response and emits one InsightsMetrics
  # record per GPU (accelerator) entry that reports the requested field.
  #
  # metricJSON          - parsed cAdvisor summary stats (Hash with "pods")
  # hostName            - node name stamped on each record as "Computer"
  # metricNameToCollect - accelerator field to read (e.g. "dutyCycle")
  # metricNametoReturn  - metric "Name" surfaced to the backend
  # metricPollTime      - ISO8601 collection timestamp shared by the batch
  #
  # Returns an array of record hashes; on error logs a warning and returns
  # whatever was accumulated so far (possibly empty).
  metricItems = []
  clusterId = KubernetesApiClient.getClusterId
  clusterName = KubernetesApiClient.getClusterName
  begin
    metricJSON["pods"].each do |pod|
      podUid = pod["podRef"]["uid"]
      # Kept for the (currently disabled) k8s-namespace tag below.
      # NOTE(review): the original code assigned an unused `podName` and the
      # commented tag referenced a misspelled `podNameSpace`; both fixed here.
      podNamespace = pod["podRef"]["namespace"]
      next if pod["containers"].nil?

      pod["containers"].each do |container|
        next if container["accelerators"].nil?

        containerName = container["name"]
        container["accelerators"].each do |accelerator|
          metricValue = accelerator[metricNameToCollect]
          next if metricValue.nil? #empty check is invalid for non-strings

          metricItem = {}
          metricItem["CollectionTime"] = metricPollTime
          metricItem["Computer"] = hostName
          metricItem["Name"] = metricNametoReturn
          metricItem["Value"] = metricValue
          metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
          metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE

          metricTags = {}
          metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId
          metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName
          metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName
          #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNamespace

          # Optional GPU descriptors - only tagged when cAdvisor reports them.
          if (!accelerator["make"].nil? && !accelerator["make"].empty?)
            metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = accelerator["make"]
          end
          if (!accelerator["model"].nil? && !accelerator["model"].empty?)
            metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_MODEL] = accelerator["model"]
          end
          if (!accelerator["id"].nil? && !accelerator["id"].empty?)
            metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_ID] = accelerator["id"]
          end

          metricItem["Tags"] = metricTags
          metricItems.push(metricItem)
        end
      end
    end
  rescue => errorStr
    @Log.warn("getContainerGpuMetricsAsInsightsMetrics failed: #{errorStr} for metric #{metricNameToCollect}")
    return metricItems
  end
  metricItems
end

def clearDeletedWinContainersFromCache()
begin
winCpuUsageNanoSecondsKeys = @@winContainerCpuUsageNanoSecondsLast.keys
Expand Down
131 changes: 131 additions & 0 deletions source/code/plugin/KubernetesApiClient.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class KubernetesApiClient
require "time"

require_relative "oms_common"
require_relative "constants"

@@ApiVersion = "v1"
@@ApiVersionApps = "v1"
Expand Down Expand Up @@ -430,6 +431,87 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName
return metricItems
end #getContainerResourceRequestAndLimits

# Builds InsightsMetrics records for container-level resource requests or
# limits from a pod-list API response.
#
# metricJSON          - parsed pod list (Hash with "items")
# metricCategory      - "requests" or "limits" (key under container "resources")
# metricNameToCollect - resource name, e.g. "cpu", "memory", "nvidia.com/gpu"
# metricNametoReturn  - metric "Name" surfaced to the backend
# metricTime          - ISO8601 collection timestamp shared by the batch
#
# Returns an array of record hashes; on error logs a warning and returns
# whatever was accumulated so far (possibly empty).
def getContainerResourceRequestsAndLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
  metricItems = []
  begin
    clusterId = getClusterId
    clusterName = getClusterName

    metricInfo = metricJSON
    metricInfo["items"].each do |pod|
      podNameSpace = pod["metadata"]["namespace"]
      if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences")
        # The above case seems to be the only case where you have horizontal scaling of pods
        # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash
        # instead of the actual poduid. Since this uid is not being surface into the UX
        # its ok to use this.
        # Use kubernetes.io/config.hash to be able to correlate with cadvisor data
        if pod["metadata"]["annotations"].nil?
          next
        else
          podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"]
        end
      else
        podUid = pod["metadata"]["uid"]
      end

      # Collect regular containers first so record ordering is stable.
      podContainers = []
      if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty?
        podContainers = podContainers + pod["spec"]["containers"]
      end
      # Adding init containers to the record list as well.
      if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty?
        podContainers = podContainers + pod["spec"]["initContainers"]
      end

      if (!podContainers.nil? && !podContainers.empty?)
        if (!pod["spec"]["nodeName"].nil?)
          nodeName = pod["spec"]["nodeName"]
        else
          nodeName = "" #unscheduled pod. We still want to collect limits & requests for GPU
        end
        podContainers.each do |container|
          metricValue = nil
          containerName = container["name"]
          #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z
          if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?)
            metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect])
          else
            #No container level limit for the given metric, so default to node level limit for non-gpu metrics
            # NOTE(review): @@NodeMetrics is populated by parseNodeLimits* for this
            # cluster/node/category; lookup yields nil (record skipped) if absent.
            if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu")
              nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect
              metricValue = @@NodeMetrics[nodeMetricsHashKey]
            end
          end
          # Only emit a record when a container-level or cached node-level value exists.
          if (!metricValue.nil?)
            metricItem = {}
            metricItem["CollectionTime"] = metricTime
            metricItem["Computer"] = nodeName
            metricItem["Name"] = metricNametoReturn
            metricItem["Value"] = metricValue
            metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
            metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE

            metricTags = {}
            metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId
            metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName
            metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName
            #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace

            metricItem["Tags"] = metricTags

            metricItems.push(metricItem)
          end
        end
      end
    end
  rescue => error
    @Log.warn("getcontainerResourceRequestsAndLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
    return metricItems
  end
  return metricItems
end #getContainerResourceRequestAndLimitsAsInsightsMetrics

def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
metricItems = []
begin
Expand Down Expand Up @@ -473,6 +555,51 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet
return metricItems
end #parseNodeLimits

def parseNodeLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
  # Emits one InsightsMetrics record per node for the requested node-status
  # metric. metricCategory is "capacity" or "allocatable"; metricNameToCollect
  # is e.g. "cpu", "memory", "amd.com/gpu" or "nvidia.com/gpu".
  #
  # All records share metricTime: the node list is fetched in one call and
  # kubernetes doesnt specify a timestamp for the capacity and allocation
  # metrics, so one caller-supplied time is used for every node.
  #
  # Side effect: non-GPU values are cached in @@NodeMetrics so container-level
  # collection can fall back to node allocatable/capacity when a container has
  # no explicit limit.
  nodeMetricItems = []
  begin
    clusterId = getClusterId
    clusterName = getClusterName
    gpuResourceNames = ["nvidia.com/gpu", "amd.com/gpu"]
    metricJSON["items"].each do |node|
      categoryStatus = node["status"][metricCategory]
      next if categoryStatus.nil? || categoryStatus[metricNameToCollect].nil?

      nodeName = node["metadata"]["name"]
      metricValue = getMetricNumericValue(metricNameToCollect, categoryStatus[metricNameToCollect])

      record = {}
      record["CollectionTime"] = metricTime
      record["Computer"] = nodeName
      record["Name"] = metricNametoReturn
      record["Value"] = metricValue
      record["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
      record["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE
      record["Tags"] = {
        Constants::INSIGHTSMETRICS_TAGS_CLUSTERID => clusterId,
        Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME => clusterName,
        Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR => metricNameToCollect,
      }
      nodeMetricItems.push(record)

      # Cache node-level values (except gpu ones) for container-level fallback
      # lookups when container cpu & memory limits are not defined.
      unless gpuResourceNames.include?(metricNameToCollect.downcase)
        @@NodeMetrics[clusterId + "/" + nodeName + "_" + metricCategory + "_" + metricNameToCollect] = metricValue
        #@Log.info ("Node metric hash: #{@@NodeMetrics}")
      end
    end
  rescue => error
    @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
  end
  nodeMetricItems
end

def getMetricNumericValue(metricName, metricVal)
metricValue = metricVal.downcase
begin
Expand Down Expand Up @@ -538,6 +665,10 @@ def getMetricNumericValue(metricName, metricVal)
else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units')
metricValue = Float(metricValue) * 1000.0 ** 3
end
when "nvidia.com/gpu"
metricValue = Float(metricValue) * 1.0
when "amd.com/gpu"
metricValue = Float(metricValue) * 1.0
else
@Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. Returning 0 for metric value")
metricValue = 0
Expand Down
15 changes: 15 additions & 0 deletions source/code/plugin/constants.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Shared constant strings for InsightsMetrics records: tag keys, the origin /
# namespace markers stamped on every record, and the fluentd routing tag.
# All values are frozen so accidental mutation of these shared literals
# raises instead of silently corrupting emitted records.
class Constants
  INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms".freeze
  INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId".freeze
  INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName".freeze
  INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor".freeze
  INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu".freeze
  INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel".freeze
  INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId".freeze
  INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName".freeze
  # Fixed copy-paste bug: this was "containerName", which made the container-id
  # tag key collide with the container-name tag key above.
  INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerId".freeze
  INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace".freeze
  INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName".freeze
  INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind".freeze
  INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics".freeze
end
33 changes: 32 additions & 1 deletion source/code/plugin/in_cadvisor_perf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def initialize
require_relative "CAdvisorMetricsAPIClient"
require_relative "oms_common"
require_relative "omslog"
require_relative "constants"
end

config_param :run_interval, :time, :default => 60
Expand Down Expand Up @@ -50,8 +51,10 @@ def enumerate()
currentTime = Time.now
time = currentTime.to_f
batchTime = currentTime.utc.iso8601
@@istestvar = ENV["ISTEST"]
begin
eventStream = MultiEventStream.new
insightsMetricsEventStream = MultiEventStream.new
metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime )
metricData.each do |record|
record["DataType"] = "LINUX_PERF_BLOB"
Expand All @@ -64,10 +67,38 @@ def enumerate()
router.emit_stream(@containerhealthtag, eventStream) if eventStream
router.emit_stream(@nodehealthtag, eventStream) if eventStream

@@istestvar = ENV["ISTEST"]

if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0)
$log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}")
end

#start GPU InsightsMetrics items
begin
containerGPUusageInsightsMetricsDataItems = []
containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime))


containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord|
wrapper = {
"DataType" => "INSIGHTS_METRICS_BLOB",
"IPName" => "ContainerInsights",
"DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }],
}
insightsMetricsEventStream.add(time, wrapper) if wrapper
end

router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream

if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0)
$log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
end
rescue => errorStr
$log.warn "Failed when processing GPU Usage metrics in_cadvisor_perf : #{errorStr}"
$log.debug_backtrace(errorStr.backtrace)
ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
end
#end GPU InsightsMetrics items

rescue => errorStr
$log.warn "Failed to retrieve cadvisor metric data: #{errorStr}"
$log.debug_backtrace(errorStr.backtrace)
Expand Down
Loading