From 3562da73d046d5792099a2ba0d65a2f011a5f62b Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 4 Apr 2019 13:35:37 -0700 Subject: [PATCH 1/2] Fix omsagent crash Error when kube-api returns non-200, send events for HTTP Errors --- source/code/plugin/filter_cadvisor2mdm.rb | 104 ++++++++++++---------- source/code/plugin/out_mdm.rb | 3 + 2 files changed, 58 insertions(+), 49 deletions(-) diff --git a/source/code/plugin/filter_cadvisor2mdm.rb b/source/code/plugin/filter_cadvisor2mdm.rb index 94f2107cc..df8e108db 100644 --- a/source/code/plugin/filter_cadvisor2mdm.rb +++ b/source/code/plugin/filter_cadvisor2mdm.rb @@ -10,45 +10,45 @@ module Fluent class CAdvisor2MdmFilter < Filter Fluent::Plugin.register_filter('filter_cadvisor2mdm', self) - + config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log' config_param :custom_metrics_azure_regions, :string config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes' - + @@cpu_usage_milli_cores = 'cpuUsageMillicores' @@cpu_usage_nano_cores = 'cpuusagenanocores' @@object_name_k8s_node = 'K8SNode' @@hostName = (OMS::Common.get_hostname) @@custom_metrics_template = ' - { - "time": "%{timestamp}", - "data": { - "baseData": { - "metric": "%{metricName}", - "namespace": "Insights.Container/nodes", - "dimNames": [ + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "Insights.Container/nodes", + "dimNames": [ "host" - ], - "series": [ - { - "dimValues": [ + ], + "series": [ + { + "dimValues": [ "%{hostvalue}" - ], + ], "min": %{metricminvalue}, - "max": %{metricmaxvalue}, - "sum": %{metricsumvalue}, - "count": 1 - } - ] - } - } + "max": %{metricmaxvalue}, + "sum": %{metricsumvalue}, + "count": 1 + } + ] + } + } }' - + @@metric_name_metric_percentage_name_hash = { - @@cpu_usage_milli_cores => "cpuUsagePercentage", + @@cpu_usage_milli_cores => "cpuUsagePercentage", "memoryRssBytes" => "memoryRssPercentage", - "memoryWorkingSetBytes" => "memoryWorkingSetPercentage" + "memoryWorkingSetBytes" => "memoryWorkingSetPercentage" } @process_incoming_stream = true @@ -61,7 +61,7 @@ def initialize def configure(conf) super @log = nil - + if @enable_log @log = Logger.new(@log_path, 1, 5000000) @log.debug {'Starting filter_cadvisor2mdm plugin'} @@ -70,15 +70,20 @@ def configure(conf) def start super - @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) - @metrics_to_collect_hash = build_metrics_hash - @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" - - # initialize cpu and memory limit - if @process_incoming_stream - @cpu_capacity = 0.0 - @memory_capacity = 0.0 - ensure_cpu_memory_capacity_set + begin + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) + @metrics_to_collect_hash = build_metrics_hash + @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" + + # initialize cpu and memory limit + if @process_incoming_stream + @cpu_capacity = 0.0 + @memory_capacity = 0.0 + ensure_cpu_memory_capacity_set + end + rescue => e + @log.info "Error initializing plugin #{e}" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) end end @@ -117,9 +122,9 @@ def filter(tag, time, record) if @memory_capacity != 0.0 percentage_metric_value = metric_value*100/@memory_capacity end - end + end return get_metric_records(record, metric_name, metric_value, percentage_metric_value) - else + else return [] end else @@ -140,13 +145,13 @@ def ensure_cpu_memory_capacity_set return end - begin + begin nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes?fieldSelector=metadata.name%3D#{@@hostName}").body) rescue Exception => e @log.info "Error when getting nodeInventory from kube API. Exception: #{e.class} Message: #{e.message} " ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) end - if !nodeInventory.nil? + if !nodeInventory.nil? cpu_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores") if !cpu_capacity_json.nil? && !cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'].to_s.nil? @cpu_capacity = cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'] @@ -163,7 +168,7 @@ def ensure_cpu_memory_capacity_set end end end - + def get_metric_records(record, metric_name, metric_value, percentage_metric_value) records = [] custommetricrecord = @@custom_metrics_template % { @@ -194,20 +199,21 @@ def get_metric_records(record, metric_name, metric_value, percentage_metric_valu return records end - + def filter_stream(tag, es) new_es = MultiEventStream.new - ensure_cpu_memory_capacity_set - es.each { |time, record| - begin + begin + ensure_cpu_memory_capacity_set + es.each { |time, record| filtered_records = filter(tag, time, record) - filtered_records.each {|filtered_record| + filtered_records.each {|filtered_record| new_es.add(time, filtered_record) if filtered_record - } if filtered_records - rescue => e - router.emit_error_event(tag, time, record, e) - end - } + } if filtered_records + } + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + @log.info "#{e}" + end new_es end end diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 963069858..5e905d842 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -140,6 +140,7 @@ def write(chunk) end end rescue Exception => e + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) @log.info "Exception when writing to MDM: #{e}" raise e end @@ -167,10 +168,12 @@ def send_to_mdm(post_body) # Not raising exception, as that will cause retries to happen elsif !response.code.empty? && response.code.start_with?("4") # Log 400 errors and continue + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPostError-#{response.code} ", {}) @log.info "Non-retryable HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" else # raise if the response code is non-400 @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPostError-#{response.code} ", {}) raise e end rescue Errno::ETIMEDOUT => e From 136e76275d9fce7aff8835295b48faf64408540b Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 5 Apr 2019 11:17:53 -0700 Subject: [PATCH 2/2] Fixing the bug, deferring telemetry changes for later --- source/code/plugin/filter_cadvisor2mdm.rb | 4 +--- source/code/plugin/out_mdm.rb | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/source/code/plugin/filter_cadvisor2mdm.rb b/source/code/plugin/filter_cadvisor2mdm.rb index df8e108db..a6e643e45 100644 --- a/source/code/plugin/filter_cadvisor2mdm.rb +++ b/source/code/plugin/filter_cadvisor2mdm.rb @@ -83,7 +83,6 @@ def start end rescue => e @log.info "Error initializing plugin #{e}" - ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) end end @@ -211,8 +210,7 @@ def filter_stream(tag, es) } if filtered_records } rescue => e - ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) - @log.info "#{e}" + @log.info "Error in filter_stream #{e.message}" end new_es end diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 5e905d842..351198afe 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -164,16 +164,13 @@ def send_to_mdm(post_body) @log.info "Response Code #{response.code} Updating @last_post_attempt_time" @last_post_attempt_time = Time.now @first_post_attempt_made = true - ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) # Not raising exception, as that will cause retries to happen elsif !response.code.empty? && response.code.start_with?("4") # Log 400 errors and continue - ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPostError-#{response.code} ", {}) @log.info "Non-retryable HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" else # raise if the response code is non-400 @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" - ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPostError-#{response.code} ", {}) raise e end rescue Errno::ETIMEDOUT => e