Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 57 additions & 55 deletions source/plugins/ruby/constants.rb
Original file line number Diff line number Diff line change
@@ -1,61 +1,61 @@
# frozen_string_literal: true

class Constants
INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms"
INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId"
INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName"
INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor"
INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu"
INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel"
INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId"
INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName"
INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerName"
INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace"
INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName"
INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind"
INSIGHTSMETRICS_TAGS_POD_UID = "podUid"
INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv"
INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName"
INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace"
INSIGHTSMETRICS_TAGS_POD_NAME = "podName"
INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes"
INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName"
INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics"
REASON_OOM_KILLED = "oomkilled"
#Kubestate (common)
INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE = "container.azm.ms/kubestate"
INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME = "creationTime"
#Kubestate (deployments)
INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE = "kube_deployment_status_replicas_ready"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME = "deployment"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_CREATIONTIME = "creationTime"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY = "deploymentStrategy"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS = "spec_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED = "status_replicas_updated"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE = "status_replicas_available"
#Kubestate (HPA)
INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE = "kube_hpa_status_current_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME = "hpa"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS = "spec_max_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS = "spec_min_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND = "targetKind"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME = "targetName"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS = "status_desired_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME = "lastScaleTime"
# MDM Metric names
MDM_OOM_KILLED_CONTAINER_COUNT = "oomKilledContainerCount"
MDM_CONTAINER_RESTART_COUNT = "restartingContainerCount"
MDM_POD_READY_PERCENTAGE = "podReadyPercentage"
MDM_STALE_COMPLETED_JOB_COUNT = "completedJobsCount"
MDM_DISK_USED_PERCENTAGE = "diskUsedPercentage"
MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage"
MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage"
MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage"
MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage"
MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage"
MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage"
MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage"
INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms"
INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId"
INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName"
INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor"
INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu"
INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel"
INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId"
INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName"
INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerName"
INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace"
INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName"
INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind"
INSIGHTSMETRICS_TAGS_POD_UID = "podUid"
INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv"
INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName"
INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace"
INSIGHTSMETRICS_TAGS_POD_NAME = "podName"
INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes"
INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName"
INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics"
REASON_OOM_KILLED = "oomkilled"
#Kubestate (common)
INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE = "container.azm.ms/kubestate"
INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME = "creationTime"
#Kubestate (deployments)
INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE = "kube_deployment_status_replicas_ready"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME = "deployment"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_CREATIONTIME = "creationTime"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY = "deploymentStrategy"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS = "spec_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED = "status_replicas_updated"
INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE = "status_replicas_available"
#Kubestate (HPA)
INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE = "kube_hpa_status_current_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME = "hpa"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS = "spec_max_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS = "spec_min_replicas"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND = "targetKind"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME = "targetName"
INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS = "status_desired_replicas"

INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME = "lastScaleTime"
# MDM Metric names
MDM_OOM_KILLED_CONTAINER_COUNT = "oomKilledContainerCount"
MDM_CONTAINER_RESTART_COUNT = "restartingContainerCount"
MDM_POD_READY_PERCENTAGE = "podReadyPercentage"
MDM_STALE_COMPLETED_JOB_COUNT = "completedJobsCount"
MDM_DISK_USED_PERCENTAGE = "diskUsedPercentage"
MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage"
MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage"
MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage"
MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage"
MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage"
MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage"
MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage"

CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5
OBJECT_NAME_K8S_CONTAINER = "K8SContainer"
Expand Down Expand Up @@ -88,6 +88,8 @@ class Constants
KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15
ZERO_FILL_METRICS_INTERVAL_IN_MINUTES = 30
MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour"
MDM_EXCEPTION_TELEMETRY_METRIC = "AKSCustomMetricsMdmExceptions"
MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL = 30

#Pod Statuses
POD_STATUS_TERMINATING = "Terminating"
Expand Down
51 changes: 47 additions & 4 deletions source/plugins/ruby/out_mdm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ def initialize
@cluster_identity = nil
@isArcK8sCluster = false
@get_access_token_backoff_expiry = Time.now

@mdm_exceptions_hash = {}
@mdm_exceptions_count = 0
@mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i
end

def configure(conf)
Expand Down Expand Up @@ -221,10 +225,49 @@ def format(tag, time, record)
end
end

# Tallies one occurrence of +error+ for the current telemetry flush interval.
#
# The error is keyed by its string form (+error.to_s+) in @mdm_exceptions_hash,
# and @mdm_exceptions_count is bumped so the total number of exceptions seen
# since the last flush can be reported as a single metric value.
#
# Any StandardError raised while aggregating is logged and reported through
# ApplicationInsightsUtility.sendExceptionTelemetry instead of being re-raised.
def exception_aggregator(error)
  message = error.to_s
  seen_so_far = @mdm_exceptions_hash[message]
  # First sighting of this message starts its tally at 1.
  @mdm_exceptions_hash[message] = seen_so_far.nil? ? 1 : seen_so_far + 1
  # Running total across all messages, sent as the metric value on flush.
  @mdm_exceptions_count += 1
rescue => aggregation_error
  @log.info "Error in MDM exception_aggregator method: #{aggregation_error}"
  ApplicationInsightsUtility.sendExceptionTelemetry(aggregation_error)
end

# Sends the exceptions aggregated since the last flush as one metric, once at
# least Constants::MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL minutes have elapsed
# since @mdm_exception_telemetry_time_tracker; otherwise does nothing.
#
# The metric value is @mdm_exceptions_count. The per-message tallies
# (@mdm_exceptions_hash, serialized as JSON) and the flush interval are
# attached as properties. After sending, the count, the hash and the time
# tracker are reset.
#
# Any StandardError raised here is logged and reported through
# ApplicationInsightsUtility.sendExceptionTelemetry instead of being re-raised.
def flush_mdm_exception_telemetry
  elapsed_seconds = (DateTime.now.to_time.to_i - @mdm_exception_telemetry_time_tracker).abs
  flush_interval = Constants::MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL
  # Integer division: whole elapsed minutes are compared against the interval.
  return if elapsed_seconds / 60 < flush_interval

  properties = {
    "ExceptionsHashForFlushInterval" => @mdm_exceptions_hash.to_json,
    "FlushInterval" => flush_interval,
  }
  ApplicationInsightsUtility.sendMetricTelemetry(Constants::MDM_EXCEPTION_TELEMETRY_METRIC, @mdm_exceptions_count, properties)

  # Start a fresh aggregation window only after the metric has been handed off.
  @mdm_exceptions_count = 0
  @mdm_exceptions_hash = {}
  @mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i
rescue => flush_error
  @log.info "Error in flush_mdm_exception_telemetry method: #{flush_error}"
  ApplicationInsightsUtility.sendExceptionTelemetry(flush_error)
end

# This method is called every flush interval. Send the buffer chunk to MDM.
# 'chunk' is a buffer chunk that includes multiple formatted records
def write(chunk)
begin
# Adding this before trying to flush out metrics, since adding after can lead to metrics never being sent
flush_mdm_exception_telemetry
if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes * 60)) && @can_send_data_to_mdm
post_body = []
chunk.msgpack_each { |(tag, record)|
Expand All @@ -247,7 +290,8 @@ def write(chunk)
end
end
rescue Exception => e
ApplicationInsightsUtility.sendExceptionTelemetry(e)
# Adding exceptions to hash to aggregate and send telemetry for all write errors
exception_aggregator(e)
@log.info "Exception when writing to MDM: #{e}"
raise e
end
Expand Down Expand Up @@ -282,7 +326,6 @@ def send_to_mdm(post_body)
else
@log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}"
end
#@log.info "MDM request : #{post_body}"
@log.debug_backtrace(e.backtrace)
if !response.code.empty? && response.code == 403.to_s
@log.info "Response Code #{response.code} Updating @last_post_attempt_time"
Expand All @@ -297,15 +340,15 @@ def send_to_mdm(post_body)
@log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}"
raise e
end
# Adding exceptions to hash to aggregate and send telemetry for all 400 error codes
exception_aggregator(e)
rescue Errno::ETIMEDOUT => e
@log.info "Timed out when POSTing Metrics to MDM : #{e} Response: #{response}"
@log.debug_backtrace(e.backtrace)
ApplicationInsightsUtility.sendExceptionTelemetry(e)
raise e
rescue Exception => e
@log.info "Exception POSTing Metrics to MDM : #{e} Response: #{response}"
@log.debug_backtrace(e.backtrace)
ApplicationInsightsUtility.sendExceptionTelemetry(e)
raise e
end
end
Expand Down