Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions installer/conf/container.conf
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,17 @@
max_retry_wait 5m
retry_mdm_post_wait_minutes 60
</match>

<match oms.api.InsightsMetrics**>
type out_oms
log_level debug
num_threads 5
buffer_type file
buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer
buffer_queue_full_action drop_oldest_chunk
buffer_chunk_limit 4m
flush_interval 20s
retry_limit 10
retry_wait 5s
max_retry_wait 5m
</match>
15 changes: 15 additions & 0 deletions installer/conf/kube.conf
Original file line number Diff line number Diff line change
Expand Up @@ -215,4 +215,19 @@
retry_limit 10
retry_wait 5s
max_retry_wait 5m
</match>

<match oms.api.InsightsMetrics**>
type out_oms
log_level debug
num_threads 5
buffer_chunk_limit 4m
buffer_type file
buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer
buffer_queue_limit 20
buffer_queue_full_action drop_oldest_chunk
flush_interval 20s
retry_limit 10
retry_wait 5s
max_retry_wait 5m
</match>
2 changes: 1 addition & 1 deletion installer/datafiles/base_container.data
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ MAINTAINER: 'Microsoft Corporation'
/opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root
/opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/code/plugin/filter_inventory2mdm.rb; 644; root; root
/opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/code/plugin/CustomMetricsUtils.rb; 644; root; root

/opt/microsoft/omsagent/plugin/constants.rb; source/code/plugin/constants.rb; 644; root; root

/opt/microsoft/omsagent/plugin/ApplicationInsightsUtility.rb; source/code/plugin/ApplicationInsightsUtility.rb; 644; root; root
/opt/microsoft/omsagent/plugin/ContainerInventoryState.rb; source/code/plugin/ContainerInventoryState.rb; 644; root; root
Expand Down
96 changes: 96 additions & 0 deletions source/code/plugin/CAdvisorMetricsAPIClient.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class CAdvisorMetricsAPIClient
require_relative "oms_common"
require_relative "KubernetesApiClient"
require_relative "ApplicationInsightsUtility"
require_relative "constants"

@configMapMountPath = "/etc/config/settings/log-data-collection-settings"
@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings"
Expand Down Expand Up @@ -282,6 +283,101 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met
return metricItems
end

def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601)
  # Collects GPU (accelerator) InsightsMetrics records from cAdvisor's
  # summary stats for one node: the given Windows node hash, or the local
  # Linux node when winNode is nil. Returns an array of metric record
  # hashes; returns whatever was accumulated (possibly empty) on error.
  insightsMetrics = []
  begin
    response = getSummaryStatsFromCAdvisor(winNode)
    metricInfo = response.nil? ? nil : JSON.parse(response.body)

    if winNode.nil?
      operatingSystem = "Linux"
      # Prefer the node name cAdvisor reports; fall back to the agent host name.
      nodeInfo = metricInfo.nil? ? nil : metricInfo["node"]
      hostName = (nodeInfo.nil? ? nil : nodeInfo["nodeName"]) || (OMS::Common.get_hostname)
    else
      operatingSystem = "Windows"
      hostName = winNode["Hostname"]
    end

    if metricInfo.nil?
      @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}")
    else
      # cAdvisor accelerator field name => metric name surfaced to the backend
      {
        "memoryTotal" => "containerGpumemoryTotalBytes",
        "memoryUsed" => "containerGpumemoryUsedBytes",
        "dutyCycle" => "containerGpuDutyCycle",
      }.each do |acceleratorField, metricName|
        insightsMetrics.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, acceleratorField, metricName, metricTime))
      end
    end
  rescue => error
    @Log.warn("CAdvisorMetricsAPIClient::getInsightsMetrics failed: #{error}")
    return insightsMetrics
  end
  insightsMetrics
end

def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime)
  # Walks a parsed cAdvisor summary response and emits one InsightsMetrics
  # record per GPU (accelerator) entry that reports the requested field.
  #
  # metricJSON          - parsed cAdvisor summary stats (Hash with "pods")
  # hostName            - node name stamped on each record as "Computer"
  # metricNameToCollect - accelerator field to read (e.g. "dutyCycle")
  # metricNametoReturn  - metric "Name" surfaced to the backend
  # metricPollTime      - ISO8601 collection timestamp shared by the batch
  #
  # Returns an array of record hashes; on error logs a warning and returns
  # whatever was accumulated so far (possibly empty).
  metricItems = []
  clusterId = KubernetesApiClient.getClusterId
  clusterName = KubernetesApiClient.getClusterName
  begin
    metricJSON["pods"].each do |pod|
      podUid = pod["podRef"]["uid"]
      # Kept for the (currently disabled) k8s-namespace tag below.
      # NOTE(review): the original code assigned an unused `podName` and the
      # commented tag referenced a misspelled `podNameSpace`; both fixed here.
      podNamespace = pod["podRef"]["namespace"]
      next if pod["containers"].nil?

      pod["containers"].each do |container|
        next if container["accelerators"].nil?

        containerName = container["name"]
        container["accelerators"].each do |accelerator|
          metricValue = accelerator[metricNameToCollect]
          next if metricValue.nil? #empty check is invalid for non-strings

          metricItem = {}
          metricItem["CollectionTime"] = metricPollTime
          metricItem["Computer"] = hostName
          metricItem["Name"] = metricNametoReturn
          metricItem["Value"] = metricValue
          metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
          metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE

          metricTags = {}
          metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId
          metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName
          metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName
          #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNamespace

          # Optional GPU descriptors - only tagged when cAdvisor reports them.
          if (!accelerator["make"].nil? && !accelerator["make"].empty?)
            metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = accelerator["make"]
          end
          if (!accelerator["model"].nil? && !accelerator["model"].empty?)
            metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_MODEL] = accelerator["model"]
          end
          if (!accelerator["id"].nil? && !accelerator["id"].empty?)
            metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_ID] = accelerator["id"]
          end

          metricItem["Tags"] = metricTags
          metricItems.push(metricItem)
        end
      end
    end
  rescue => errorStr
    @Log.warn("getContainerGpuMetricsAsInsightsMetrics failed: #{errorStr} for metric #{metricNameToCollect}")
    return metricItems
  end
  metricItems
end

def clearDeletedWinContainersFromCache()
begin
winCpuUsageNanoSecondsKeys = @@winContainerCpuUsageNanoSecondsLast.keys
Expand Down
131 changes: 131 additions & 0 deletions source/code/plugin/KubernetesApiClient.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class KubernetesApiClient
require "time"

require_relative "oms_common"
require_relative "constants"

@@ApiVersion = "v1"
@@ApiVersionApps = "v1"
Expand Down Expand Up @@ -430,6 +431,87 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName
return metricItems
end #getContainerResourceRequestAndLimits

# Builds InsightsMetrics records for container-level resource requests or
# limits from a pod-list API response.
#
# metricJSON          - parsed pod list (Hash with "items")
# metricCategory      - "requests" or "limits" (key under container "resources")
# metricNameToCollect - resource name, e.g. "cpu", "memory", "nvidia.com/gpu"
# metricNametoReturn  - metric "Name" surfaced to the backend
# metricTime          - ISO8601 collection timestamp shared by the batch
#
# Returns an array of record hashes; on error logs a warning and returns
# whatever was accumulated so far (possibly empty).
def getContainerResourceRequestsAndLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
  metricItems = []
  begin
    clusterId = getClusterId
    clusterName = getClusterName

    metricInfo = metricJSON
    metricInfo["items"].each do |pod|
      podNameSpace = pod["metadata"]["namespace"]
      if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences")
        # The above case seems to be the only case where you have horizontal scaling of pods
        # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash
        # instead of the actual poduid. Since this uid is not being surface into the UX
        # its ok to use this.
        # Use kubernetes.io/config.hash to be able to correlate with cadvisor data
        if pod["metadata"]["annotations"].nil?
          next
        else
          podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"]
        end
      else
        podUid = pod["metadata"]["uid"]
      end

      # Collect regular containers first so record ordering is stable.
      podContainers = []
      if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty?
        podContainers = podContainers + pod["spec"]["containers"]
      end
      # Adding init containers to the record list as well.
      if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty?
        podContainers = podContainers + pod["spec"]["initContainers"]
      end

      if (!podContainers.nil? && !podContainers.empty?)
        if (!pod["spec"]["nodeName"].nil?)
          nodeName = pod["spec"]["nodeName"]
        else
          nodeName = "" #unscheduled pod. We still want to collect limits & requests for GPU
        end
        podContainers.each do |container|
          metricValue = nil
          containerName = container["name"]
          #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z
          if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?)
            metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect])
          else
            #No container level limit for the given metric, so default to node level limit for non-gpu metrics
            # NOTE(review): @@NodeMetrics is populated by parseNodeLimits* for this
            # cluster/node/category; lookup yields nil (record skipped) if absent.
            if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu")
              nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect
              metricValue = @@NodeMetrics[nodeMetricsHashKey]
            end
          end
          # Only emit a record when a container-level or cached node-level value exists.
          if (!metricValue.nil?)
            metricItem = {}
            metricItem["CollectionTime"] = metricTime
            metricItem["Computer"] = nodeName
            metricItem["Name"] = metricNametoReturn
            metricItem["Value"] = metricValue
            metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
            metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE

            metricTags = {}
            metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId
            metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName
            metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName
            #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace

            metricItem["Tags"] = metricTags

            metricItems.push(metricItem)
          end
        end
      end
    end
  rescue => error
    @Log.warn("getcontainerResourceRequestsAndLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
    return metricItems
  end
  return metricItems
end #getContainerResourceRequestAndLimitsAsInsightsMetrics

def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
metricItems = []
begin
Expand Down Expand Up @@ -473,6 +555,51 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet
return metricItems
end #parseNodeLimits

def parseNodeLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601)
  # Emits one InsightsMetrics record per node for the requested node-status
  # metric. metricCategory is "capacity" or "allocatable"; metricNameToCollect
  # is e.g. "cpu", "memory", "amd.com/gpu" or "nvidia.com/gpu".
  #
  # All records share metricTime: the node list is fetched in one call and
  # kubernetes doesnt specify a timestamp for the capacity and allocation
  # metrics, so one caller-supplied time is used for every node.
  #
  # Side effect: non-GPU values are cached in @@NodeMetrics so container-level
  # collection can fall back to node allocatable/capacity when a container has
  # no explicit limit.
  nodeMetricItems = []
  begin
    clusterId = getClusterId
    clusterName = getClusterName
    gpuResourceNames = ["nvidia.com/gpu", "amd.com/gpu"]
    metricJSON["items"].each do |node|
      categoryStatus = node["status"][metricCategory]
      next if categoryStatus.nil? || categoryStatus[metricNameToCollect].nil?

      nodeName = node["metadata"]["name"]
      metricValue = getMetricNumericValue(metricNameToCollect, categoryStatus[metricNameToCollect])

      record = {}
      record["CollectionTime"] = metricTime
      record["Computer"] = nodeName
      record["Name"] = metricNametoReturn
      record["Value"] = metricValue
      record["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN
      record["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE
      record["Tags"] = {
        Constants::INSIGHTSMETRICS_TAGS_CLUSTERID => clusterId,
        Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME => clusterName,
        Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR => metricNameToCollect,
      }
      nodeMetricItems.push(record)

      # Cache node-level values (except gpu ones) for container-level fallback
      # lookups when container cpu & memory limits are not defined.
      unless gpuResourceNames.include?(metricNameToCollect.downcase)
        @@NodeMetrics[clusterId + "/" + nodeName + "_" + metricCategory + "_" + metricNameToCollect] = metricValue
        #@Log.info ("Node metric hash: #{@@NodeMetrics}")
      end
    end
  rescue => error
    @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}")
  end
  nodeMetricItems
end

def getMetricNumericValue(metricName, metricVal)
metricValue = metricVal.downcase
begin
Expand Down Expand Up @@ -538,6 +665,10 @@ def getMetricNumericValue(metricName, metricVal)
else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units')
metricValue = Float(metricValue) * 1000.0 ** 3
end
when "nvidia.com/gpu"
metricValue = Float(metricValue) * 1.0
when "amd.com/gpu"
metricValue = Float(metricValue) * 1.0
else
@Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. Returning 0 for metric value")
metricValue = 0
Expand Down
15 changes: 15 additions & 0 deletions source/code/plugin/constants.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Shared constant strings for InsightsMetrics records: tag keys, the origin /
# namespace markers stamped on every record, and the fluentd routing tag.
# All values are frozen so accidental mutation of these shared literals
# raises instead of silently corrupting emitted records.
class Constants
  INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms".freeze
  INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId".freeze
  INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName".freeze
  INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor".freeze
  INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu".freeze
  INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel".freeze
  INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId".freeze
  INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName".freeze
  # Fixed copy-paste bug: this was "containerName", which made the container-id
  # tag key collide with the container-name tag key above.
  INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerId".freeze
  INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace".freeze
  INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName".freeze
  INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind".freeze
  INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics".freeze
end
33 changes: 32 additions & 1 deletion source/code/plugin/in_cadvisor_perf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def initialize
require_relative "CAdvisorMetricsAPIClient"
require_relative "oms_common"
require_relative "omslog"
require_relative "constants"
end

config_param :run_interval, :time, :default => 60
Expand Down Expand Up @@ -50,8 +51,10 @@ def enumerate()
currentTime = Time.now
time = currentTime.to_f
batchTime = currentTime.utc.iso8601
@@istestvar = ENV["ISTEST"]
begin
eventStream = MultiEventStream.new
insightsMetricsEventStream = MultiEventStream.new
metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime )
metricData.each do |record|
record["DataType"] = "LINUX_PERF_BLOB"
Expand All @@ -64,10 +67,38 @@ def enumerate()
router.emit_stream(@containerhealthtag, eventStream) if eventStream
router.emit_stream(@nodehealthtag, eventStream) if eventStream

@@istestvar = ENV["ISTEST"]

if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0)
$log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}")
end

#start GPU InsightsMetrics items
begin
containerGPUusageInsightsMetricsDataItems = []
containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime))


containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord|
wrapper = {
"DataType" => "INSIGHTS_METRICS_BLOB",
"IPName" => "ContainerInsights",
"DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }],
}
insightsMetricsEventStream.add(time, wrapper) if wrapper
end

router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream

if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0)
$log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}")
end
rescue => errorStr
$log.warn "Failed when processing GPU Usage metrics in_cadvisor_perf : #{errorStr}"
$log.debug_backtrace(errorStr.backtrace)
ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
end
#end GPU InsightsMetrics items

rescue => errorStr
$log.warn "Failed to retrieve cadvisor metric data: #{errorStr}"
$log.debug_backtrace(errorStr.backtrace)
Expand Down
Loading