diff --git a/charts/azuremonitor-containers/templates/omsagent-crd.yaml b/charts/azuremonitor-containers/templates/omsagent-crd.yaml index f4a028bd3..bbaf89a52 100644 --- a/charts/azuremonitor-containers/templates/omsagent-crd.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-crd.yaml @@ -1,3 +1,4 @@ +{{- if semverCompare "<1.19-0" .Capabilities.KubeVersion.GitVersion }} apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: @@ -10,3 +11,26 @@ spec: names: plural: healthstates kind: HealthState +{{- else }} +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: healthstates.azmon.container.insights + namespace: kube-system +spec: + group: azmon.container.insights + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + state: + type: string + scope: Namespaced + names: + plural: healthstates + kind: HealthState +{{- end }} \ No newline at end of file diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 09e50b5a4..e8352e020 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -746,14 +746,24 @@ spec: port: 25227 targetPort: in-rs-tcp --- -apiVersion: apiextensions.k8s.io/v1beta1 +# this is for versions >=1.19, for versions <1.19 we continue to use v1beta1 +apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: name: healthstates.azmon.container.insights namespace: kube-system spec: group: azmon.container.insights - version: v1 + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + state: + type: string scope: Namespaced names: plural: healthstates diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 1e7db37cc..b8104212d 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -8,9 +8,11 @@ class MdmMetricsGenerator require_relative "MdmAlertTemplates" require_relative "ApplicationInsightsUtility" require_relative "constants" + require_relative "oms_common" @log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log" @log = Logger.new(@log_path, 1, 5000000) + @@hostName = (OMS::Common.get_hostname) @oom_killed_container_count_hash = {} @container_restart_count_hash = {} @@ -38,11 +40,12 @@ class MdmMetricsGenerator } @@pod_metric_name_metric_percentage_name_hash = { - Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC + Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC, } # Setting this to true since we need to send zero filled metrics at startup. If metrics are absent alert creation fails @sendZeroFilledMetrics = true + @zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i def initialize end @@ -179,6 +182,19 @@ def zeroFillMetricRecords(records, batch_time) if !containerMemoryWorkingSetRecord.nil? && !containerMemoryWorkingSetRecord.empty? && !containerMemoryWorkingSetRecord[0].nil? && !containerMemoryWorkingSetRecord[0].empty? records.push(containerMemoryWorkingSetRecord[0]) end + + pvZeroFillDims = {} + pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL + pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = Constants::OMSAGENT_ZERO_FILL + pvResourceUtilMetricRecord = getPVResourceUtilMetricRecords(batch_time, + Constants::PV_USED_BYTES, + @@hostName, + 0, + pvZeroFillDims, + metric_threshold_hash[Constants::PV_USED_BYTES]) + if !pvResourceUtilMetricRecord.nil? && !pvResourceUtilMetricRecord.empty? && !pvResourceUtilMetricRecord[0].nil? && !pvResourceUtilMetricRecord[0].empty? + records.push(pvResourceUtilMetricRecord[0]) + end rescue => errorStr @log.info "Error in zeroFillMetricRecords: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) @@ -189,10 +205,13 @@ def zeroFillMetricRecords(records, batch_time) def appendAllPodMetrics(records, batch_time) begin @log.info "in appendAllPodMetrics..." - if @sendZeroFilledMetrics == true + timeDifference = (DateTime.now.to_time.to_i - @zeroFilledMetricsTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if @sendZeroFilledMetrics == true || (timeDifferenceInMinutes >= Constants::ZERO_FILL_METRICS_INTERVAL_IN_MINUTES) records = zeroFillMetricRecords(records, batch_time) # Setting it to false after startup @sendZeroFilledMetrics = false + @zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i end records = appendPodMetrics(records, Constants::MDM_OOM_KILLED_CONTAINER_COUNT, @@ -325,22 +344,22 @@ def getMetricRecords(record) begin dimNames = String.new "" #mutable string dimValues = String.new "" - noDimVal ="-" + noDimVal = "-" metricValue = 0 if !record["tags"].nil? - dimCount = 0 - record["tags"].each { |k, v| - dimCount = dimCount+1 - if (dimCount <= 10) #MDM = 10 dims - dimNames.concat("\"#{k}\"") - dimNames.concat(",") - if !v.nil? && v.length >0 - dimValues.concat("\"#{v}\"") - else - dimValues.concat("\"#{noDimVal}\"") - end - dimValues.concat(",") + dimCount = 0 + record["tags"].each { |k, v| + dimCount = dimCount + 1 + if (dimCount <= 10) #MDM = 10 dims + dimNames.concat("\"#{k}\"") + dimNames.concat(",") + if !v.nil? && v.length > 0 + dimValues.concat("\"#{v}\"") + else + dimValues.concat("\"#{noDimVal}\"") end + dimValues.concat(",") + end } if (dimNames.end_with?(",")) dimNames.chomp!(",") @@ -353,19 +372,19 @@ def getMetricRecords(record) convertedTimestamp = Time.at(timestamp.to_i).utc.iso8601 if !record["fields"].nil? record["fields"].each { |k, v| - if is_numeric(v) - metricRecord = MdmAlertTemplates::Generic_metric_template % { - timestamp: convertedTimestamp, - metricName: k, - namespaceSuffix: record["name"], - dimNames: dimNames, - dimValues: dimValues, - metricValue: v, - } - records.push(Yajl::Parser.parse(StringIO.new(metricRecord))) - #@log.info "pushed mdmgenericmetric: #{k},#{v}" - end - } + if is_numeric(v) + metricRecord = MdmAlertTemplates::Generic_metric_template % { + timestamp: convertedTimestamp, + metricName: k, + namespaceSuffix: record["name"], + dimNames: dimNames, + dimValues: dimValues, + metricValue: v, + } + records.push(Yajl::Parser.parse(StringIO.new(metricRecord))) + #@log.info "pushed mdmgenericmetric: #{k},#{v}" + end + } end rescue => errorStr @log.info "getMetricRecords:Error: #{errorStr} for record #{record}" @@ -375,7 +394,7 @@ def getMetricRecords(record) end def is_numeric(o) - true if Float(o) rescue false + true if Float(o) rescue false end def getContainerResourceUtilizationThresholds diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 73e3af471..be1a9de64 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -57,36 +57,37 @@ class Constants MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" - CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5 - OBJECT_NAME_K8S_CONTAINER = "K8SContainer" - OBJECT_NAME_K8S_NODE = "K8SNode" - CPU_USAGE_NANO_CORES = "cpuUsageNanoCores" - CPU_USAGE_MILLI_CORES = "cpuUsageMillicores" - MEMORY_WORKING_SET_BYTES= "memoryWorkingSetBytes" - MEMORY_RSS_BYTES = "memoryRssBytes" - PV_USED_BYTES = "pvUsedBytes" - DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 - DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 - DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 - DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 - CONTROLLER_KIND_JOB = "job" - CONTAINER_TERMINATION_REASON_COMPLETED = "completed" - CONTAINER_STATE_TERMINATED = "terminated" - STALE_JOB_TIME_IN_MINUTES = 360 - TELEGRAF_DISK_METRICS = "container.azm.ms/disk" - OMSAGENT_ZERO_FILL = "omsagent" - KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" + CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5 + OBJECT_NAME_K8S_CONTAINER = "K8SContainer" + OBJECT_NAME_K8S_NODE = "K8SNode" + CPU_USAGE_NANO_CORES = "cpuUsageNanoCores" + CPU_USAGE_MILLI_CORES = "cpuUsageMillicores" + MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes" + MEMORY_RSS_BYTES = "memoryRssBytes" + PV_USED_BYTES = "pvUsedBytes" + DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 + DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 + DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 + DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 + CONTROLLER_KIND_JOB = "job" + CONTAINER_TERMINATION_REASON_COMPLETED = "completed" + CONTAINER_STATE_TERMINATED = "terminated" + STALE_JOB_TIME_IN_MINUTES = 360 + TELEGRAF_DISK_METRICS = "container.azm.ms/disk" + OMSAGENT_ZERO_FILL = "omsagent" + KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" - #Telemetry constants - CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" - POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" - CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" - PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent" - PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled" - TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 - KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 - MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" + #Telemetry constants + CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" + POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" + CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" + PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent" + PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled" + TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 + KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 + ZERO_FILL_METRICS_INTERVAL_IN_MINUTES = 30 + MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" - #Pod Statuses - POD_STATUS_TERMINATING = "Terminating" -end \ No newline at end of file + #Pod Statuses + POD_STATUS_TERMINATING = "Terminating" +end