Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions charts/azuremonitor-containers/templates/omsagent-crd.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{{- if semverCompare "<1.19-0" .Capabilities.KubeVersion.GitVersion }}
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
Expand All @@ -10,3 +11,26 @@ spec:
names:
plural: healthstates
kind: HealthState
{{- else }}
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: healthstates.azmon.container.insights
namespace: kube-system
spec:
group: azmon.container.insights
versions:
- name: v1
served: true
storage: true
schema:
openAPIV3Schema:
type: object
properties:
state:
type: string
scope: Namespaced
names:
plural: healthstates
kind: HealthState
{{- end }}
14 changes: 12 additions & 2 deletions kubernetes/omsagent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -746,14 +746,24 @@ spec:
port: 25227
targetPort: in-rs-tcp
---
apiVersion: apiextensions.k8s.io/v1beta1
# This manifest targets Kubernetes versions >= 1.19; for versions < 1.19, continue to use apiextensions.k8s.io/v1beta1.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: healthstates.azmon.container.insights
namespace: kube-system
spec:
group: azmon.container.insights
version: v1
versions:
- name: v1
served: true
storage: true
schema:
openAPIV3Schema:
type: object
properties:
state:
type: string
scope: Namespaced
names:
plural: healthstates
Expand Down
77 changes: 48 additions & 29 deletions source/plugins/ruby/MdmMetricsGenerator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@ class MdmMetricsGenerator
require_relative "MdmAlertTemplates"
require_relative "ApplicationInsightsUtility"
require_relative "constants"
require_relative "oms_common"

@log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log"
@log = Logger.new(@log_path, 1, 5000000)
@@hostName = (OMS::Common.get_hostname)

@oom_killed_container_count_hash = {}
@container_restart_count_hash = {}
Expand Down Expand Up @@ -38,11 +40,12 @@ class MdmMetricsGenerator
}

@@pod_metric_name_metric_percentage_name_hash = {
Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC
Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC,
}

# Start as true so that zero-filled metrics are sent at startup; alert creation fails if the metrics are absent.
@sendZeroFilledMetrics = true
@zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i

def initialize
end
Expand Down Expand Up @@ -179,6 +182,19 @@ def zeroFillMetricRecords(records, batch_time)
if !containerMemoryWorkingSetRecord.nil? && !containerMemoryWorkingSetRecord.empty? && !containerMemoryWorkingSetRecord[0].nil? && !containerMemoryWorkingSetRecord[0].empty?
records.push(containerMemoryWorkingSetRecord[0])
end

pvZeroFillDims = {}
pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL
pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = Constants::OMSAGENT_ZERO_FILL
pvResourceUtilMetricRecord = getPVResourceUtilMetricRecords(batch_time,
Constants::PV_USED_BYTES,
@@hostName,
0,
pvZeroFillDims,
metric_threshold_hash[Constants::PV_USED_BYTES])
if !pvResourceUtilMetricRecord.nil? && !pvResourceUtilMetricRecord.empty? && !pvResourceUtilMetricRecord[0].nil? && !pvResourceUtilMetricRecord[0].empty?
records.push(pvResourceUtilMetricRecord[0])
end
rescue => errorStr
@log.info "Error in zeroFillMetricRecords: #{errorStr}"
ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
Expand All @@ -189,10 +205,13 @@ def zeroFillMetricRecords(records, batch_time)
def appendAllPodMetrics(records, batch_time)
begin
@log.info "in appendAllPodMetrics..."
if @sendZeroFilledMetrics == true
timeDifference = (DateTime.now.to_time.to_i - @zeroFilledMetricsTimeTracker).abs
timeDifferenceInMinutes = timeDifference / 60
if @sendZeroFilledMetrics == true || (timeDifferenceInMinutes >= Constants::ZERO_FILL_METRICS_INTERVAL_IN_MINUTES)
records = zeroFillMetricRecords(records, batch_time)
# Setting it to false after startup
@sendZeroFilledMetrics = false
@zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i
end
records = appendPodMetrics(records,
Constants::MDM_OOM_KILLED_CONTAINER_COUNT,
Expand Down Expand Up @@ -325,22 +344,22 @@ def getMetricRecords(record)
begin
dimNames = String.new "" #mutable string
dimValues = String.new ""
noDimVal ="-"
noDimVal = "-"
metricValue = 0
if !record["tags"].nil?
dimCount = 0
record["tags"].each { |k, v|
dimCount = dimCount+1
if (dimCount <= 10) #MDM = 10 dims
dimNames.concat("\"#{k}\"")
dimNames.concat(",")
if !v.nil? && v.length >0
dimValues.concat("\"#{v}\"")
else
dimValues.concat("\"#{noDimVal}\"")
end
dimValues.concat(",")
dimCount = 0
record["tags"].each { |k, v|
dimCount = dimCount + 1
if (dimCount <= 10) #MDM = 10 dims
dimNames.concat("\"#{k}\"")
dimNames.concat(",")
if !v.nil? && v.length > 0
dimValues.concat("\"#{v}\"")
else
dimValues.concat("\"#{noDimVal}\"")
end
dimValues.concat(",")
end
}
if (dimNames.end_with?(","))
dimNames.chomp!(",")
Expand All @@ -353,19 +372,19 @@ def getMetricRecords(record)
convertedTimestamp = Time.at(timestamp.to_i).utc.iso8601
if !record["fields"].nil?
record["fields"].each { |k, v|
if is_numeric(v)
metricRecord = MdmAlertTemplates::Generic_metric_template % {
timestamp: convertedTimestamp,
metricName: k,
namespaceSuffix: record["name"],
dimNames: dimNames,
dimValues: dimValues,
metricValue: v,
}
records.push(Yajl::Parser.parse(StringIO.new(metricRecord)))
#@log.info "pushed mdmgenericmetric: #{k},#{v}"
end
}
if is_numeric(v)
metricRecord = MdmAlertTemplates::Generic_metric_template % {
timestamp: convertedTimestamp,
metricName: k,
namespaceSuffix: record["name"],
dimNames: dimNames,
dimValues: dimValues,
metricValue: v,
}
records.push(Yajl::Parser.parse(StringIO.new(metricRecord)))
#@log.info "pushed mdmgenericmetric: #{k},#{v}"
end
}
end
rescue => errorStr
@log.info "getMetricRecords:Error: #{errorStr} for record #{record}"
Expand All @@ -375,7 +394,7 @@ def getMetricRecords(record)
end

# Reports whether +o+ can be converted to a Float by Kernel#Float.
#
# @param o [Object] candidate value (e.g. a metric field value)
# @return [Boolean] true when Float(o) succeeds, false when it raises
def is_numeric(o)
  # Kernel#Float raises (rather than returning 0.0 like #to_f) on input it
  # cannot convert, so a single attempt is enough to decide.
  Float(o)
  true
rescue StandardError
  # Same scope as the previous inline `rescue` modifier: any StandardError
  # raised by the conversion means "not numeric".
  false
end

def getContainerResourceUtilizationThresholds
Expand Down
63 changes: 32 additions & 31 deletions source/plugins/ruby/constants.rb
Original file line number Diff line number Diff line change
Expand Up @@ -57,36 +57,37 @@ class Constants
MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage"
MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage"

CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5
OBJECT_NAME_K8S_CONTAINER = "K8SContainer"
OBJECT_NAME_K8S_NODE = "K8SNode"
CPU_USAGE_NANO_CORES = "cpuUsageNanoCores"
CPU_USAGE_MILLI_CORES = "cpuUsageMillicores"
MEMORY_WORKING_SET_BYTES= "memoryWorkingSetBytes"
MEMORY_RSS_BYTES = "memoryRssBytes"
PV_USED_BYTES = "pvUsedBytes"
DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0
DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0
DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0
DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0
CONTROLLER_KIND_JOB = "job"
CONTAINER_TERMINATION_REASON_COMPLETED = "completed"
CONTAINER_STATE_TERMINATED = "terminated"
STALE_JOB_TIME_IN_MINUTES = 360
TELEGRAF_DISK_METRICS = "container.azm.ms/disk"
OMSAGENT_ZERO_FILL = "omsagent"
KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system"
CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5
OBJECT_NAME_K8S_CONTAINER = "K8SContainer"
OBJECT_NAME_K8S_NODE = "K8SNode"
CPU_USAGE_NANO_CORES = "cpuUsageNanoCores"
CPU_USAGE_MILLI_CORES = "cpuUsageMillicores"
MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes"
MEMORY_RSS_BYTES = "memoryRssBytes"
PV_USED_BYTES = "pvUsedBytes"
DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0
DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0
DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0
DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0
CONTROLLER_KIND_JOB = "job"
CONTAINER_TERMINATION_REASON_COMPLETED = "completed"
CONTAINER_STATE_TERMINATED = "terminated"
STALE_JOB_TIME_IN_MINUTES = 360
TELEGRAF_DISK_METRICS = "container.azm.ms/disk"
OMSAGENT_ZERO_FILL = "omsagent"
KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system"

#Telemetry constants
CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent"
POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent"
CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent"
PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent"
PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled"
TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10
KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15
MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour"
#Telemetry constants
CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent"
POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent"
CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent"
PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent"
PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled"
TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10
KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15
ZERO_FILL_METRICS_INTERVAL_IN_MINUTES = 30
MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour"

#Pod Statuses
POD_STATUS_TERMINATING = "Terminating"
end
#Pod Statuses
POD_STATUS_TERMINATING = "Terminating"
end